In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
tqdm.pandas()

In [2]:
import pandas as pd

def clean_text(text_series):
    """
    Cleans text data by removing URLs and HTML entities.

    Parameters:
    text_series (Series): Pandas Series containing text data.

    Returns:
    Series: Cleaned text data.
    """
    # Remove URLs from text
    text_series = text_series.str.replace(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', regex=True)
    # Remove HTML entities like '&gt;'
    text_series = text_series.str.replace('&gt;', '')
    return text_series

def add_datetime_columns(df, time_column='created_utc'):
    """
    Adds 'Y' (year) and 'YM' (year-month) columns to the DataFrame based on a timestamp column.

    Parameters:
    df (DataFrame): The DataFrame to modify.
    time_column (str): Name of the timestamp column.

    Returns:
    DataFrame: The modified DataFrame with added 'Y' and 'YM' columns.
    """
    # Convert timestamp to datetime and extract the year
    df['Y'] = pd.to_datetime(df[time_column], unit='s').dt.year
    # Extract year-month in 'YYYY-MM' format
    df['YM'] = pd.to_datetime(df[time_column], unit='s').dt.strftime('%Y-%m')
    return df

def submission_comment_process(subreddit, min_num_comments=3, min_score=-1, years=[2016], chunk_size=10**6):
    """
    Processes submissions and comments for a given subreddit and returns cleaned DataFrames.

    Parameters:
    subreddit (str): Name of the subreddit.
    min_num_comments (int): Minimum number of comments required for a submission.
    min_score (int): Minimum score required for a submission.
    years (list): List of years to include in the data.
    chunk_size (int): Number of rows per chunk when reading comments CSV.

    Returns:
    DataFrame, DataFrame: Cleaned submissions and comments DataFrames.
    """
    # Read submissions CSV file
    df_s = pd.read_csv(f'data/{subreddit}_submissions.csv')
    # Alternative path (commented out)
    # df_s = pd.read_csv('drive/MyDrive/Reddit/path in g drive')

    # Add 'Y' and 'YM' columns to submissions DataFrame
    df_s = add_datetime_columns(df_s, 'created_utc')
    # Filter submissions by specified years
    df_s = df_s[df_s['Y'].isin(years)]
    print(f"{subreddit}: # submissions {len(df_s)}")

    # Filter submissions based on minimum score and number of comments
    df_s = df_s[df_s['score'] > min_score]
    df_s = df_s[df_s['num_comments'] >= min_num_comments]
    # Add subreddit name to DataFrame
    df_s['sub'] = subreddit
    print(f"{subreddit}: # submissions after cleaning {len(df_s)}")
    print('Submissions processing done.')

    # Start processing comments
    print('Starting processing comments')
    # Read comments CSV file in chunks
    comments_chunks = pd.read_csv(
        f'data/{subreddit}_comments.csv', chunksize=chunk_size)
    # Alternative path (commented out)
    # comments_chunks = pd.read_csv('drive/MyDrive/Reddit/path in g drive', chunksize=chunk_size)

    df_c = pd.DataFrame()  # Initialize empty DataFrame for comments

    for i, chunk in enumerate(comments_chunks):
        # Remove 't3_' prefix from 'link_id'
        chunk['link_id'] = chunk['link_id'].str.replace('t3_', '')
        # Keep only comments linked to the filtered submissions
        chunk = chunk[chunk['link_id'].isin(df_s['id'].unique())]
        print(f"Chunk {i}: {len(chunk)} comments after filtering")

        # The following lines are commented out; they can be used for additional filtering
        # Remove comments with missing 'body' (commented out)
        # chunk = chunk[~chunk['body'].isna()]
        # Remove comments where 'author' is '[deleted]' (commented out)
        # chunk = chunk[chunk['author'] != '[deleted]']
        # Remove comments where 'body' is '[removed]' (commented out)
        # chunk = chunk[chunk['body'] != '[removed]']

        # Remove prefix from 'parent_id'
        chunk['parent_id'] = chunk['parent_id'].str[3:]
        # Keep only comments with at least 3 words in 'body' (commented out)
        # chunk = chunk[chunk['body'].str.split().str.len() >= 3]

        # Add 'Y' and 'YM' columns to comments DataFrame
        chunk = add_datetime_columns(chunk, 'created_utc')

        # Break the loop if the chunk's years are beyond the specified range
        if chunk['Y'].min() > max(years):
            break

        # Clean the 'body' text in comments
        chunk['body'] = clean_text(chunk['body'])
        # Add subreddit name to DataFrame
        chunk['sub'] = subreddit

        # Concatenate the processed chunk to the main comments DataFrame
        df_c = pd.concat([df_c, chunk], ignore_index=True)

    print(f"{subreddit}: # comments {len(df_c)}")

    # Remove 't3_' prefix from 'link_id' in comments (redundant but kept for consistency)
    df_c['link_id'] = df_c['link_id'].str.replace('t3_', '')
    # Keep only comments linked to the filtered submissions
    df_c = df_c[df_c['link_id'].isin(df_s['id'].unique())]

    # The following lines are commented out; they can be used for additional filtering
    # Remove comments where 'author' is '[deleted]' (commented out)
    # df_c = df_c[df_c['author'] != '[deleted]']
    # Remove comments where 'body' is '[deleted]' or '[removed]' (commented out)
    # df_c = df_c[~df_c['body'].isin(['[deleted]', '[removed]'])]
    # Remove comments with missing 'body' (commented out)
    # df_c = df_c[~df_c['body'].isna()]
    # Keep only comments with at least 3 words in 'body' (commented out)
    # df_c = df_c[df_c['body'].str.split().str.len() >= 3]

    # The following line is commented out; it might be for adding a 'Cred' column based on domain
    # df_c['B'] = df_c['domain'].map(df_crd.set_index("Domain")["Cred"])

    print(f"{subreddit}: # comments after cleaning {len(df_c)}")
    print('Finished processing comments')
    return df_s, df_c

def P2P_Multi(Subreddits, min_num_comments=0, min_score=-1, years=[2017]):
    """
    Processes multiple subreddits and aggregates the submissions and comments.

    Parameters:
    Subreddits (list): List of subreddit names.
    min_num_comments (int): Minimum number of comments required for a submission.
    min_score (int): Minimum score required for a submission.
    years (list): List of years to include in the data.

    Returns:
    DataFrame, DataFrame: Aggregated submissions and comments DataFrames.
    """
    Df_S = pd.DataFrame()  # Initialize empty DataFrame for submissions
    Df_C = pd.DataFrame()  # Initialize empty DataFrame for comments

    for subreddit in Subreddits:
        # Process each subreddit and get cleaned DataFrames
        df_s, df_c = submission_comment_process(
            subreddit, min_num_comments, min_score, years)
        # Concatenate the results to the aggregated DataFrames
        Df_S = pd.concat([Df_S, df_s], ignore_index=True)
        Df_C = pd.concat([Df_C, df_c], ignore_index=True)

    return Df_S, Df_C

# List of subreddits to process
Subreddits = ["business"]

# Process the subreddits and get aggregated submissions and comments
DS, DC = P2P_Multi(Subreddits, min_num_comments=0, min_score=-1, years=range(2016, 2017))

# Filter submissions to only include those with comments in DC
DS = DS[DS['sub'].isin(DC['sub'].unique())]

print('Overall Submissions:', len(DS))
print('Overall Comments:', len(DC))


business: # submissions 265670
business: # submissions after cleaning 265670
Submissions processing done.
Starting processing comments
Chunk 0: 7913 comments after filtering
Chunk 1: 103515 comments after filtering
business: # comments 111428
business: # comments after cleaning 111428
Finished processing comments
Overall Submissions: 265670
Overall Comments: 111428


In [4]:
DC.nunique()

Index          111428
id             111428
author          20662
created_utc    111152
parent_id       69699
link_id         34993
body            86125
Y                   4
YM                 29
sub                 1
dtype: int64

In [None]:
# sample submission data
DS.head(10)

In [None]:
# sample comment data
DC.head(10)

#Networkx

In [5]:
import networkx as nx
import numpy as np
f=pd.crosstab(DC['link_id'],DC['author'])
A=np.array((f.values).astype(float))
E=A@A.T
np.fill_diagonal(E,0)
print(E)
min_num_shared_user=2
Adj=np.array(E>min_num_shared_user)
# G=nx.from_numpy_array(Adj)
# print(# Graph Nodes:’, len(G.nodes))
# for v in nx.nodes(G):
    # G.nodes[v]["sub"] = DS_b.iloc[v]["sub"]
# nx.write_gexf(G, 'business.gexf')

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [22]:
subset = Adj[:1000, :1000]
G=nx.from_numpy_array(subset)
nx.write_gexf(G, 'data/business.gexf')

In [17]:
subset.shape

(1000, 1000)

In [20]:
G.number_of_nodes()

1000

In [18]:
G.size()

11499

In [11]:
subset.shape
# G=nx.from_numpy_array(Adj)

(1000, 1000)

In [None]:
ds=DS[DS['id']==sbm]
dc=DC[DC['link_id']==sbm]


In [None]:
import pandas as pd


data = DC[DC['link_id']==sbm]

# Filter necessary columns
comments = data[['id', 'parent_id', 'body']]

# Initialize the comment tree with the root link_id
link_id = sbm  # Assuming '5j5otc' is the link_id for root comments
comment_tree = {link_id: []}

# Function to add comment to the tree
def add_to_tree(comment_id, parent_id, comment_body):
    if parent_id == link_id:  # Handling root comments
        if parent_id not in comment_tree:
            comment_tree[parent_id] = []
        comment_tree[parent_id].append({'id': comment_id, 'body': comment_body, 'replies': []})
    else:
        # Recursively find the parent and add the comment as a reply
        find_and_add(parent_id, comment_tree[link_id], {'id': comment_id, 'body': comment_body, 'replies': []})

# Recursive function to find the parent in the tree and add the current comment
def find_and_add(parent_id, current_level, comment):
    for entry in current_level:
        if entry['id'] == parent_id:
            entry['replies'].append(comment)
            return True
        if find_and_add(parent_id, entry['replies'], comment):
            return True
    return False

# Build the tree by iterating over each comment
for _, row in comments.iterrows():
    add_to_tree(row['id'], row['parent_id'], row['body'])

# Optional: Function to print the tree in a structured manner for better readability
def print_tree(level, indent=0):
    for node in level:
        print(' ' * indent + f"Comment ID: {node['id']}, Body: {node['body'][:60]}...")  # Print truncated body
        if node['replies']:
            print_tree(node['replies'], indent + 4)

# Example usage: Print the first few levels of the tree
print_tree(comment_tree[link_id][:2])  # Adjust as needed for full tree or specific sections


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Assuming DC and sbm are defined somewhere earlier in your code
data = DC[DC['link_id'] == sbm]
fig, ax = plt.subplots(figsize=(10, 10))

# Filter necessary columns
comments = data[['id', 'parent_id', 'body']]

# Create a directed graph
G = nx.DiGraph()

# Define a root node for visualization and add nodes and edges
root_node = 'Root'
G.add_node(root_node, body="Root Post")
for _, row in comments.iterrows():
    G.add_node(row['id'], body=row['id'])  # Use full ID for display
    parent_id = row['parent_id'] if not row['parent_id'].startswith(sbm) else root_node
    G.add_edge(parent_id, row['id'])

# Compute depth for each node for shell assignment
depth = nx.single_source_shortest_path_length(G, root_node)
# Create shell layout based on depths
max_depth = max(depth.values())
shells = [[] for _ in range(max_depth + 1)]
for node, d in depth.items():
    shells[d].append(node)

# Define node colors based on depth with a more vibrant color map
node_color = [depth[node] / max_depth for node in G.nodes()]

if 1:
  # Draw the graph using the shell layout
  pos = nx.shell_layout(G, shells)
else:
  pos = nx.spring_layout(G, k=0.1, iterations=50)  # Adjust k and iterations for better layout

nx.draw_networkx_nodes(G, pos, node_color=node_color, node_size=100, cmap=plt.cm.viridis, alpha=0.8)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)
nx.draw_networkx_labels(G, pos, labels={n: G.nodes[n]['body'] for n in G.nodes}, font_size=8)

plt.title("Enhanced Comment Tree Visualization")
plt.axis('off')  # Hide axes
plt.show()
