In [1]:
import os
import warnings
from typing import List, Tuple

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()
warnings.filterwarnings("ignore")

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
def clean_text(text_series: pd.Series) -> pd.Series:
    """
    Cleans text data by removing URLs and HTML entities.
    """
    # Remove URLs
    text_series = text_series.str.replace(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', regex=True)

    # Remove HTML
    text_series = text_series.str.replace('&gt;', '')

    return text_series

def add_datetime_columns(df: pd.DataFrame, time_column: str = 'created_utc') -> pd.DataFrame:
    """
    Adds 'Y' (year) and 'YM' (year-month) columns to the DataFrame based on a timestamp column.
    """
    # Convert timestamp to datetime and extract the year
    df['Y'] = pd.to_datetime(df[time_column], unit='s').dt.year

    # Extract year-month in 'YYYY-MM' format
    df['YM'] = pd.to_datetime(df[time_column], unit='s').dt.strftime('%Y-%m')

    return df

def process_subreddit(data_root: str, subreddit_names: List[str], min_num_comments: int = 3, min_score: int = -1, years: List[int] = [2016], chunk_size: int = 10**6) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Processes submissions and comments for a given subreddit.
    """
    all_submissions = pd.DataFrame()
    all_comments = pd.DataFrame()

    for subreddit_name in subreddit_names:
        # Read submissions CSV file
        submissions = pd.read_csv(f"{os.path.join(data_root, subreddit_name)}_submissions.csv")

        # Add 'Y' and 'YM' columns to submissions DataFrame
        submissions = add_datetime_columns(submissions, 'created_utc')

        # Filter submissions by specified years
        submissions = submissions[submissions['Y'].isin(years)]

        # Filter submissions based on minimum score and number of comments
        submissions = submissions[submissions['score'] > min_score]
        submissions = submissions[submissions['num_comments'] >= min_num_comments]

        # Add subreddit name to DataFrame
        submissions['sub'] = subreddit_name

        print(f"{subreddit_name} submissions: {len(submissions)}")

        # Read comments CSV file in chunks
        comments_chunks = pd.read_csv(f"{os.path.join(data_root, subreddit_name)}_comments.csv", chunksize=chunk_size)
        comments = pd.DataFrame()

        for i, chunk in enumerate(comments_chunks):
            chunk['link_id'] = chunk['link_id'].str.replace('t3_', '')

            # Keep only comments linked to the filtered submissions
            chunk = chunk[chunk['link_id'].isin(submissions['id'].unique())]

            # Remove prefix from 'parent_id'
            chunk['parent_id'] = chunk['parent_id'].str[3:]

            # Add 'Y' and 'YM' columns to comments DataFrame
            chunk = add_datetime_columns(chunk, 'created_utc')

            # Break the loop if the chunk's years are beyond the specified range
            if chunk['Y'].min() > max(years):
                break

            # Clean the 'body' text in comments
            chunk['body'] = clean_text(chunk['body'])

            # Add subreddit name to DataFrame
            chunk['sub'] = subreddit_name

            # Concatenate the processed chunk to the main comments DataFrame
            comments = pd.concat([comments, chunk], ignore_index=True)

        # Remove 't3_' prefix from 'link_id' in comments (redundant but kept for consistency)
        comments['link_id'] = comments['link_id'].str.replace('t3_', '')

        # Keep only comments linked to the filtered submissions
        comments = comments[comments['link_id'].isin(submissions['id'].unique())]

        print(f"{subreddit_name} comments: {len(comments)}")
        
        all_submissions = pd.concat([all_submissions, submissions], ignore_index=True)
        all_comments = pd.concat([all_comments, comments], ignore_index=True)

    return all_submissions, all_comments

submissions, comments = process_subreddit(
    data_root="data/subreddits", 
    subreddit_names=["business", "climate", "energy", "labor", "education", "news"],
    min_num_comments=3,
    min_score=-1,
    years=range(2016, 2017)
)

# Filter submissions to only include those with comments
submissions = submissions[submissions['sub'].isin(comments['sub'].unique())]

print('Overall Submissions:', len(submissions))
print('Overall Comments:', len(comments))

business submissions: 3929
business comments: 75980
climate submissions: 1483
climate comments: 14174
energy submissions: 2631
energy comments: 49671
labor submissions: 262
labor comments: 1874
education submissions: 1584
education comments: 21415
news submissions: 55639
news comments: 7286594
Overall Submissions: 65528
Overall Comments: 7449708


In [11]:
# Get the credibility information for each domain
domain_credibility = pd.read_csv("data/domain_credibility.csv", index_col=0, header=0, names=['domain', 'bias', 'credibility'])

# Merge credibility information with submissions on domain
submissions = submissions.merge(domain_credibility, left_on='domain', right_on='domain', how='left')

# Drop submissions with missing credibility information
submissions = submissions.dropna(subset=['bias', 'credibility'])

# Remove submissions from [deleted] authors
submissions = submissions[submissions['author'] != '[deleted]']

# Calculate the average credibility rating for each author
author_credibility = submissions.groupby('author', as_index=False)['credibility'].mean()

# Remove comments from authors with no credibility information
comments = comments[comments['author'].isin(author_credibility["author"])]

KeyError: ['bias', 'credibility']

In [13]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2', device="cuda")
comments["embedding"] = comments["body"].progress_apply(lambda x: np.array(sentence_transformer.encode(x)))

100%|██████████| 444370/444370 [14:55<00:00, 496.03it/s]


In [4]:
# TODO: Include links to the original submissions in the comments DataFrame
# posts = pd.concat([submissions[["author", "id"]].rename(columns={"id": "link_id"}), comments[["author", "link_id"]]])

# Calculate the number of shared comments between each pair of authors
frequencies_df = pd.crosstab(comments["author"], comments['link_id'])

credibilities = frequencies_df.merge(author_credibility, on="author", how='left')["credibility"].to_list()

frequencies = np.array(frequencies_df, dtype=float)

adjacency_matrix = frequencies @ frequencies.T
np.fill_diagonal(adjacency_matrix, 0)

# Connect authors with at least n shared comments
adjacency_matrix = (adjacency_matrix >= 1).astype(int)

In [5]:
# Create a graph from the adjacency matrix
graph = nx.from_numpy_array(adjacency_matrix)

In [55]:
comments_indexed = comments.set_index(['id', 'author'])
comments_indexed.xs('eigenman', level='author')

Unnamed: 0_level_0,Index,created_utc,parent_id,link_id,body,Y,YM,sub,embedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cyic7p5,991812,1451664317,3z0hsy,3z0hsy,We welcome our new lords and masters.,2016,2016-01,business,"[0.032246627, 0.012166687, 0.06292515, -0.0576..."
cz3j7xt,997824,1453174086,41kafd,41kafd,Good.,2016,2016-01,business,"[-0.088112794, -0.012578506, -0.06211385, 0.04..."
czq50ml,1003413,1454780155,44gecs,44gecs,It was broad-based: The only category where re...,2016,2016-02,business,"[0.019247988, -0.027884975, 0.09817463, 0.0848..."
d01zafr,1006501,1455638118,461wa8,461wa8,Oil price is dropping today heh.,2016,2016-02,business,"[-0.018226402, 0.007058047, 0.15740283, 0.0377..."
d0su8nd,1013078,1457488583,d0ss9in,49i8z3,Different areas have different drilling/fracki...,2016,2016-03,business,"[0.07944631, 0.0022230602, 0.0765335, 0.072117..."
...,...,...,...,...,...,...,...,...,...
dbp5vgt,21630897,1482886411,5kju39,5kju39,Go to bed Israel. You're drunk.,2016,2016-12,news,"[0.14629324, 0.026536558, 0.009616932, -0.0032..."
dbp5ww8,21630903,1482886472,5klivq,5klivq,Wow that UN vote had more effect than I though...,2016,2016-12,news,"[-0.0006077492, 0.021225117, 0.02102755, -0.01..."
dbrr3rj,21668605,1483047197,5kyj7x,5kyj7x,Time to short bitcoin.,2016,2016-12,news,"[0.025361173, 0.10327468, 0.005644338, 0.01683..."
dbrzi7j,21672633,1483058733,dbrywow,5kyj7x,Sure. The old fashioned way. Borrow some bit...,2016,2016-12,news,"[-0.03206374, 0.03921009, -0.05505047, -0.0132..."


In [76]:
# def get_cred_sim(parent_author_credibility, user_body, parent_id):
#     # Get embeddings
#     user_embedding = sentence_transformer.encode(user_body).reshape(1, -1)
#     parent_embedding = comments_indexed.xs(parent_id, level='author')["embedding"].values[0].reshape(1, -1)

#     # Calculate similarity
#     similarity = cosine_similarity(user_embedding, parent_embedding)[0][0]
    
#     # Normalize credibility to [-1, 1]
#     parent_author_credibility = (parent_author_credibility * 2 - 1)

#     return similarity * parent_author_credibility

def get_cred_sim(parent_author_credibility, user_body, parent_body):
    # Get embeddings
    user_embedding = sentence_transformer.encode(user_body).reshape(1, -1)
    parent_embedding = sentence_transformer.encode(parent_body).reshape(1, -1)

    # Calculate similarity
    similarity = cosine_similarity(user_embedding, parent_embedding)[0][0]
    
    # Normalize credibility to [-1, 1]
    parent_author_credibility = (parent_author_credibility * 2 - 1)

    return similarity * parent_author_credibility

In [77]:
def create_cred_sims():
    # Create df for author similarities
    author_cred_sims = pd.DataFrame(columns=['author', 'similarity'])

    # Loop through all users
    users = comments['author'].unique()
    for user in tqdm(users):

        # Array for storing similarity values
        cred_sims = []

        # Loop through all comments
        user_comments = comments[comments['author'] == user]
        for user_comment in user_comments.iterrows():
            user_comment = user_comment[1]
            user_body = user_comment['body']

            # Get parent id of the comment
            parent_id = user_comment['parent_id']

            # Get parent comment
            parent_comment = comments[comments['id'] == parent_id]

            # If the parent comment is not found, skip
            if parent_comment.empty:
                continue

            # Get parent comment body
            parent_body = parent_comment['body'].values[0]

            # Get author of the parent comment
            parent_author = parent_comment['author'].values[0]

            # Get author credibility'
            parent_author_credibility = author_credibility[author_credibility['author'] == parent_author]['credibility'].values[0]

            # Calculate similarity value and store in array
            cred_sim = get_cred_sim(parent_author_credibility, user_body, parent_body)
            cred_sims.append(cred_sim)

        # Check for empty array
        if len(cred_sims) == 0:
            continue

        # Calculate average similarity value
        avg_cred_sim = sum(cred_sims) / len(cred_sims)

        # Add to df
        new_row = {'author': parent_author, 'similarity': avg_cred_sim}
        author_cred_sims.loc[len(author_cred_sims)] = new_row
        
    return author_cred_sims

In [78]:
cred_sims = create_cred_sims()

  0%|          | 4/6278 [00:19<8:38:07,  4.95s/it]


KeyboardInterrupt: 

In [None]:
Add betweeness centrality, clustering coefficient, degree, and credibility as node attributes
betweenness = {i: float(b) for i, b in nx.betweenness_centrality(graph).items()}
nx.set_node_attributes(graph, betweenness, 'betweenness')

clustering = {i: float(c) for i, c in nx.clustering(graph).items()}
nx.set_node_attributes(graph, clustering, 'clustering')

degree = dict(nx.degree(graph))
nx.set_node_attributes(graph, degree, 'degree')

credibility = {i: int(credibility > 0.5) for i, credibility in enumerate(credibilities)}
nx.set_node_attributes(graph, credibility, 'credibility')

nx.write_gexf(graph, 'data/reddit.gexf')