In [1]:
import os
import warnings
from typing import List, Tuple

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer

tqdm.pandas()
warnings.filterwarnings("ignore")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
2024-11-15 00:25:33.347719: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-15 00:25:33.361813: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by 

In [2]:
def clean_text(text_series: pd.Series) -> pd.Series:
    """
    Cleans text data by removing URLs and HTML entities.
    """
    # Remove URLs
    text_series = text_series.str.replace(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', regex=True)

    # Remove HTML
    text_series = text_series.str.replace('&gt;', '')

    return text_series

def add_datetime_columns(df: pd.DataFrame, time_column: str = 'created_utc') -> pd.DataFrame:
    """
    Adds 'Y' (year) and 'YM' (year-month) columns to the DataFrame based on a timestamp column.
    """
    # Convert timestamp to datetime and extract the year
    df['Y'] = pd.to_datetime(df[time_column], unit='s').dt.year

    # Extract year-month in 'YYYY-MM' format
    df['YM'] = pd.to_datetime(df[time_column], unit='s').dt.strftime('%Y-%m')

    return df

def process_subreddit(data_root: str, subreddit_names: List[str], min_num_comments: int = 3, min_score: int = -1, years: List[int] = [2016], chunk_size: int = 10**6) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Processes submissions and comments for a given subreddit.
    """
    all_submissions = pd.DataFrame()
    all_comments = pd.DataFrame()

    for subreddit_name in subreddit_names:
        # Read submissions CSV file
        submissions = pd.read_csv(f"{os.path.join(data_root, subreddit_name)}_submissions.csv")

        # Add 'Y' and 'YM' columns to submissions DataFrame
        submissions = add_datetime_columns(submissions, 'created_utc')

        # Filter submissions by specified years
        submissions = submissions[submissions['Y'].isin(years)]

        # Filter submissions based on minimum score and number of comments
        submissions = submissions[submissions['score'] > min_score]
        submissions = submissions[submissions['num_comments'] >= min_num_comments]

        # Add subreddit name to DataFrame
        submissions['sub'] = subreddit_name

        print(f"{subreddit_name} submissions: {len(submissions)}")

        # Read comments CSV file in chunks
        comments_chunks = pd.read_csv(f"{os.path.join(data_root, subreddit_name)}_comments.csv", chunksize=chunk_size)
        comments = pd.DataFrame()

        for i, chunk in enumerate(comments_chunks):
            chunk['link_id'] = chunk['link_id'].str.replace('t3_', '')

            # Keep only comments linked to the filtered submissions
            chunk = chunk[chunk['link_id'].isin(submissions['id'].unique())]

            # Remove prefix from 'parent_id'
            chunk['parent_id'] = chunk['parent_id'].str[3:]

            # Add 'Y' and 'YM' columns to comments DataFrame
            chunk = add_datetime_columns(chunk, 'created_utc')

            # Break the loop if the chunk's years are beyond the specified range
            if chunk['Y'].min() > max(years):
                break

            # Clean the 'body' text in comments
            chunk['body'] = clean_text(chunk['body'])

            # Add subreddit name to DataFrame
            chunk['sub'] = subreddit_name

            # Concatenate the processed chunk to the main comments DataFrame
            comments = pd.concat([comments, chunk], ignore_index=True)

        # Remove 't3_' prefix from 'link_id' in comments (redundant but kept for consistency)
        comments['link_id'] = comments['link_id'].str.replace('t3_', '')

        # Keep only comments linked to the filtered submissions
        comments = comments[comments['link_id'].isin(submissions['id'].unique())]

        print(f"{subreddit_name} comments: {len(comments)}")
        
        all_submissions = pd.concat([all_submissions, submissions], ignore_index=True)
        all_comments = pd.concat([all_comments, comments], ignore_index=True)

    return all_submissions, all_comments

submissions, comments = process_subreddit(
    data_root="data/subreddits", 
    subreddit_names=["business", "climate", "energy", "labor", "education", "news"],
    min_num_comments=3,
    min_score=-1,
    years=range(2016, 2017)
)

# Filter submissions to only include those with comments
submissions = submissions[submissions['sub'].isin(comments['sub'].unique())]

print('Overall Submissions:', len(submissions))
print('Overall Comments:', len(comments))

business submissions: 3929
business comments: 75980
climate submissions: 1483
climate comments: 14174
energy submissions: 2631
energy comments: 49671
labor submissions: 262
labor comments: 1874
education submissions: 1584
education comments: 21415
news submissions: 55639
news comments: 7286594
Overall Submissions: 65528
Overall Comments: 7449708


In [3]:
# Get the credibility information for each domain
domain_credibility = pd.read_csv("data/domain_credibility.csv", index_col=0, header=0, names=['domain', 'bias', 'credibility'])

# Merge credibility information with submissions on domain
submissions = submissions.merge(domain_credibility, left_on='domain', right_on='domain', how='left')

# Drop submissions with missing credibility information
submissions = submissions.dropna(subset=['bias', 'credibility'])

# Remove submissions from [deleted] authors
submissions = submissions[submissions['author'] != '[deleted]']

# Calculate the average credibility rating for each author
author_credibility = submissions.groupby('author', as_index=False)['credibility'].mean()

# Remove comments from authors with no credibility information
comments = comments[comments['author'].isin(author_credibility["author"])]

In [4]:
# Embed the text data for comments
model = SentenceTransformer('all-MiniLM-L6-v2', device="cuda")  # You can choose other models as well

comments["embedding"] = comments["body"].progress_apply(lambda x: np.array(model.encode(x)))

# Calculate the average embedding for each author
author_embeddings = comments.groupby('author', as_index=False)['embedding'].apply(lambda x: np.mean(np.vstack(x), axis=0).tolist())

embed_dim = len(author_embeddings['embedding'].iloc[0])

100%|██████████| 444370/444370 [14:25<00:00, 513.20it/s]


In [5]:
# TODO: Include links to the original submissions in the comments DataFrame
# posts = pd.concat([submissions[["author", "id"]].rename(columns={"id": "link_id"}), comments[["author", "link_id"]]])

# Calculate the number of shared comments between each pair of authors
frequencies_df = pd.crosstab(comments["author"], comments['link_id'])

# Associate credibility and embeddings with authors
credibilities = frequencies_df.merge(author_credibility, on="author", how='left')["credibility"].to_list()
embeddings = frequencies_df.merge(author_embeddings, on="author", how='left')["embedding"].to_list()

frequencies = np.array(frequencies_df, dtype=float)

adjacency_matrix = frequencies @ frequencies.T
np.fill_diagonal(adjacency_matrix, 0)

# Connect authors with at least n shared comments
adjacency_matrix = (adjacency_matrix >= 1).astype(int)

In [6]:
# Create a graph from the adjacency matrix
graph = nx.from_numpy_array(adjacency_matrix)

In [7]:
# Add betweeness centrality, clustering coefficient, degree, and credibility as node attributes
betweenness = {i: float(b) for i, b in nx.betweenness_centrality(graph).items()}
nx.set_node_attributes(graph, betweenness, 'betweenness')

clustering = {i: float(c) for i, c in nx.clustering(graph).items()}
nx.set_node_attributes(graph, clustering, 'clustering')

degree = dict(nx.degree(graph))
nx.set_node_attributes(graph, degree, 'degree')

credibility_dict = {i: credibility for i, credibility in enumerate(credibilities)}
nx.set_node_attributes(graph, credibility_dict, 'credibility')

embedding_dict = {i: {} for i in range(embed_dim)}
for i, embedding in enumerate(embeddings):
    for j in range(embed_dim):
        embedding_dict[j][i] = embedding[j]
    
for i in range(embed_dim):
    nx.set_node_attributes(graph, embedding_dict[i], f'embedding_{i}')

nx.write_gexf(graph, 'data/reddit.gexf')