### Reddit Data Cleaning and Wrangling

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import json
import pprint

#### Posts

In [9]:
# Step 1: read json file, store as list of dicts
posts_lst = []
with open('reddit_post_data.json', 'r') as file:
    for line in file:
        posts_lst.append(json.loads(line))

We should observe the structure of the dictionary to understand what features we want to store in our dataframe and how:

In [10]:
# Step 2: Check and set keys of necessary information
pprint.pprint(posts_lst[0])

{'all_awardings': [],
 'allow_live_comments': False,
 'approved_at_utc': None,
 'approved_by': None,
 'archived': False,
 'author': 'Active_Appeal_2673',
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_1hzvzxepdw',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'banned_at_utc': None,
 'banned_by': None,
 'can_gild': False,
 'can_mod_post': False,
 'category': None,
 'clicked': False,
 'comment_limit': 2048,
 'comment_sort': 'confidence',
 'content_categories': None,
 'contest_mode': False,
 'created': 1737951266.0,
 'created_utc': 1737951266.0,
 'discussion_type': None,
 'distinguished': None,
 'domain': 'reddit.com',
 'downs': 0,
 'edited': False,
 'gallery_data': {'items': [{'id': 593729473, 'media_id': 'mhe95ikepgfe1'},
   

We define the keys to extract for dataframe:

In [11]:
keys_to_extract = ['author', 'author_fullname', 'created_utc', 'permalink',
                   'name', 'link_id', 'subreddit_id', 'subreddit', 
                   'num_crossposts', 'num_comments', 'score', 'selftext', 'title', 'ups', 'upvote_ratio']

We extract the necessary information and store the data in one large dataframe:

In [12]:
# Step 3: Store Info in Dataframe
extracted_data = [{k: d[k] for k in keys_to_extract if k in d} for d in posts_lst]
posts_df = pd.DataFrame(extracted_data)
posts_df.shape

(163, 14)

In [13]:
posts_df[:5]

Unnamed: 0,author,author_fullname,created_utc,permalink,name,subreddit_id,subreddit,num_crossposts,num_comments,score,selftext,title,ups,upvote_ratio
0,Active_Appeal_2673,t2_1hzvzxepdw,1737951000.0,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t3_1iazjin,t5_4uoy2u,AmIOverreacting,6,4983,59203,"Throwaway account, my main has too much person...",AIO - My “friend” wants to kick me out after I...,59203,0.92
1,Bitter-Gur-4613,t2_azbuqzix,1736800000.0,/r/clevercomebacks/comments/1i0nukq/were_at_th...,t3_1i0nukq,t5_dyqlw,clevercomebacks,3,2597,49224,,"We're at the ""It isn't happening yet!"" stage o...",49224,0.92
2,Eadkrakka,t2_k1glcfx,1736747000.0,/r/clevercomebacks/comments/1i0788c/i_mean_hes...,t3_1i0788c,t5_dyqlw,clevercomebacks,5,1117,48630,,"I mean, he's not wrong.",48630,0.94
3,Diligent_Night602,t2_1drxg2jm4b,1736864000.0,/r/popculturechat/comments/1i16xr6/it_is_being...,t3_1i16xr6,t5_5rj68d,popculturechat,3,630,31670,,It Is Being Pointed Out That Kim And Khloé Kar...,31670,0.97
4,t1mdawg,t2_4d2nq,1736364000.0,/r/AdviceAnimals/comments/1hwsk85/the_la_wildf...,t3_1hwsk85,t5_2s7tt,AdviceAnimals,1,1246,29323,,The LA wildfires,29323,0.89


Now that we have loaded the json file and stored the values we need in dataframe, we want to clean our data. To do that, we start by observing the data and gathering information. 

In [14]:
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   author           163 non-null    object 
 1   author_fullname  163 non-null    object 
 2   created_utc      163 non-null    float64
 3   permalink        163 non-null    object 
 4   name             163 non-null    object 
 5   subreddit_id     163 non-null    object 
 6   subreddit        163 non-null    object 
 7   num_crossposts   163 non-null    int64  
 8   num_comments     163 non-null    int64  
 9   score            163 non-null    int64  
 10  selftext         163 non-null    object 
 11  title            163 non-null    object 
 12  ups              163 non-null    int64  
 13  upvote_ratio     163 non-null    float64
dtypes: float64(2), int64(4), object(8)
memory usage: 18.0+ KB


In [15]:
posts_df.isna().any()

author             False
author_fullname    False
created_utc        False
permalink          False
name               False
subreddit_id       False
subreddit          False
num_crossposts     False
num_comments       False
score              False
selftext           False
title              False
ups                False
upvote_ratio       False
dtype: bool

From our exploration above, we see that there is NaN or Null values in only one column in our dataframe. The column is author_fullname, which is the unique id for each account starting with "t2_". While we also have column author which according to Reddit's official document should be unique to each user, we do not want to take risks. Hence, we decide to drop the rows with NaN/Null values inplace:

In [16]:
# Step 4: Remove rows containing NAs from dataframe 
# posts_df.dropna(how = 'any', inplace=True)

In [17]:
posts_df.shape

(163, 14)

Now, we have a dataframe with no NaN/Null values in any columns, let's check by printing out the info:

In [18]:
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   author           163 non-null    object 
 1   author_fullname  163 non-null    object 
 2   created_utc      163 non-null    float64
 3   permalink        163 non-null    object 
 4   name             163 non-null    object 
 5   subreddit_id     163 non-null    object 
 6   subreddit        163 non-null    object 
 7   num_crossposts   163 non-null    int64  
 8   num_comments     163 non-null    int64  
 9   score            163 non-null    int64  
 10  selftext         163 non-null    object 
 11  title            163 non-null    object 
 12  ups              163 non-null    int64  
 13  upvote_ratio     163 non-null    float64
dtypes: float64(2), int64(4), object(8)
memory usage: 18.0+ KB


We can see that a lot of the text information's dtype is object. We want to convert them to string. Luckily, Pandas has a function that automatically detects the most suitable dtype and convert all values in dataframe to their most suitable dtype.

In [19]:
# Step 5: Convert dtype: object to string
posts_df = posts_df.convert_dtypes()
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   author           163 non-null    string 
 1   author_fullname  163 non-null    string 
 2   created_utc      163 non-null    Int64  
 3   permalink        163 non-null    string 
 4   name             163 non-null    string 
 5   subreddit_id     163 non-null    string 
 6   subreddit        163 non-null    string 
 7   num_crossposts   163 non-null    Int64  
 8   num_comments     163 non-null    Int64  
 9   score            163 non-null    Int64  
 10  selftext         163 non-null    string 
 11  title            163 non-null    string 
 12  ups              163 non-null    Int64  
 13  upvote_ratio     163 non-null    Float64
dtypes: Float64(1), Int64(5), string(8)
memory usage: 18.9 KB


The objects are turned to strings. 

We also want to process our created_utc column. However, we are not sure how reddit codes the datetime attribute, let's test it with the first value of created_utc:

In [20]:
# Step 6: Convert dtype: float to datetime
test = posts_df['created_utc'][0]
pd.to_datetime(test, unit='s')

Timestamp('2025-01-27 04:14:26')

Bingo! This conversion result is reasonable and matches with the information shown if we directly visit the page. Let's convert all values in created_utc column.

In [21]:
posts_df['created_utc'] = pd.to_datetime(posts_df['created_utc'], unit = 's')
posts_df[:5]

Unnamed: 0,author,author_fullname,created_utc,permalink,name,subreddit_id,subreddit,num_crossposts,num_comments,score,selftext,title,ups,upvote_ratio
0,Active_Appeal_2673,t2_1hzvzxepdw,2025-01-27 04:14:26,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t3_1iazjin,t5_4uoy2u,AmIOverreacting,6,4983,59203,"Throwaway account, my main has too much person...",AIO - My “friend” wants to kick me out after I...,59203,0.92
1,Bitter-Gur-4613,t2_azbuqzix,2025-01-13 20:33:23,/r/clevercomebacks/comments/1i0nukq/were_at_th...,t3_1i0nukq,t5_dyqlw,clevercomebacks,3,2597,49224,,"We're at the ""It isn't happening yet!"" stage o...",49224,0.92
2,Eadkrakka,t2_k1glcfx,2025-01-13 05:43:10,/r/clevercomebacks/comments/1i0788c/i_mean_hes...,t3_1i0788c,t5_dyqlw,clevercomebacks,5,1117,48630,,"I mean, he's not wrong.",48630,0.94
3,Diligent_Night602,t2_1drxg2jm4b,2025-01-14 14:13:25,/r/popculturechat/comments/1i16xr6/it_is_being...,t3_1i16xr6,t5_5rj68d,popculturechat,3,630,31670,,It Is Being Pointed Out That Kim And Khloé Kar...,31670,0.97
4,t1mdawg,t2_4d2nq,2025-01-08 19:26:49,/r/AdviceAnimals/comments/1hwsk85/the_la_wildf...,t3_1hwsk85,t5_2s7tt,AdviceAnimals,1,1246,29323,,The LA wildfires,29323,0.89


We can also modify the permalink column to complete the fragmented links:

In [22]:
# Step 7: Modifying values
posts_df['permalink'] = 'https://old.reddit.com' + posts_df['permalink']

Now, we have a cleaned dataframe ready for analysis. 

In [23]:
posts_df[:10]

Unnamed: 0,author,author_fullname,created_utc,permalink,name,subreddit_id,subreddit,num_crossposts,num_comments,score,selftext,title,ups,upvote_ratio
0,Active_Appeal_2673,t2_1hzvzxepdw,2025-01-27 04:14:26,https://old.reddit.com/r/AmIOverreacting/comme...,t3_1iazjin,t5_4uoy2u,AmIOverreacting,6,4983,59203,"Throwaway account, my main has too much person...",AIO - My “friend” wants to kick me out after I...,59203,0.92
1,Bitter-Gur-4613,t2_azbuqzix,2025-01-13 20:33:23,https://old.reddit.com/r/clevercomebacks/comme...,t3_1i0nukq,t5_dyqlw,clevercomebacks,3,2597,49224,,"We're at the ""It isn't happening yet!"" stage o...",49224,0.92
2,Eadkrakka,t2_k1glcfx,2025-01-13 05:43:10,https://old.reddit.com/r/clevercomebacks/comme...,t3_1i0788c,t5_dyqlw,clevercomebacks,5,1117,48630,,"I mean, he's not wrong.",48630,0.94
3,Diligent_Night602,t2_1drxg2jm4b,2025-01-14 14:13:25,https://old.reddit.com/r/popculturechat/commen...,t3_1i16xr6,t5_5rj68d,popculturechat,3,630,31670,,It Is Being Pointed Out That Kim And Khloé Kar...,31670,0.97
4,t1mdawg,t2_4d2nq,2025-01-08 19:26:49,https://old.reddit.com/r/AdviceAnimals/comment...,t3_1hwsk85,t5_2s7tt,AdviceAnimals,1,1246,29323,,The LA wildfires,29323,0.89
5,Elegant_Noise1116,t2_r4zx1zd1,2025-01-13 18:08:49,https://old.reddit.com/r/nextfuckinglevel/comm...,t3_1i0kavt,t5_m0bnr,nextfuckinglevel,14,1003,29286,Khalsa Aid volunteers provided water and suppl...,Sikh community providing supplies to those aff...,29286,0.89
6,TheExpressUS,t2_10wjaq62j0,2025-01-10 01:20:31,https://old.reddit.com/r/antiwork/comments/1hx...,t3_1hxstm9,t5_2y77d,antiwork,2,697,24651,,Airbnb denies refund as LA wildfires not a 'ma...,24651,0.97
7,fredgoeswest,t2_1epclhfffo,2025-01-10 17:37:13,https://old.reddit.com/r/Chihuahua/comments/1h...,t3_1hy9y05,t5_2r2ct,Chihuahua,1,538,17986,"3 months ago I flew from Vancouver, Canada to ...",Celebrating 3 months since adopting Fred,17986,0.98
8,Lumpy_Ad8864,t2_1h42onjnug,2025-01-17 09:38:33,https://old.reddit.com/r/RealTwitterAccounts/c...,t3_1i3cse3,t5_7cxc0i,RealTwitterAccounts,3,280,17617,,"The Sierra Club, fighting back with facts afte...",17617,1.0
9,20_mile,t2_12krc8w6dq,2025-01-11 13:46:49,https://old.reddit.com/r/news/comments/1hyw7g6...,t3_1hyw7g6,t5_2qh3l,news,5,1210,16545,,‘Essential’: nearly 800 incarcerated firefight...,16545,0.96


#### Comments

In [24]:
# Step 1: read json file, store as list of dicts
comments_lst = []
with open('reddit_comment_data.json', 'r') as file:
    for line in file:
        comments_lst.append(json.loads(line))

We should observe the structure of the dictionary to understand what features we want to store in our dataframe and how:

In [25]:
# Step 2: Check and set keys of necessary information
pprint.pprint(comments_lst[0])

{'all_awardings': [],
 'approved_at_utc': None,
 'approved_by': None,
 'archived': False,
 'associated_award': None,
 'author': 'TangledUpPuppeteer',
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_mm7m0ccve',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'banned_at_utc': None,
 'banned_by': None,
 'body': '“You’re a worthless, spineless piece of shit, deadass.”\n'
         '\n'
         'If he ever crawls out of the sludge he currently resides in, remind '
         'him he can go “car camping haha”\n'
         '\n'
         'Put this shit on blast to every mutual you have.',
 'can_gild': False,
 'can_mod_post': False,
 'collapsed': False,
 'collapsed_because_crowd_control': None,
 'collapsed_reason': None,
 'collapsed_reason

We define the keys to extract for dataframe:

In [26]:
keys_to_extract = ['author', 'author_fullname', 'created_utc', 'permalink',
                   'name', 'link_id', 'parent_id', 'subreddit_id', 'subreddit', 
                   'num_crossposts', 'num_comments', 'score', 'selftext', 'title', 'ups', 'upvote_ratio']

We extract the necessary information and store the data in one large dataframe:

In [27]:
# Step 3: Store Info in Dataframe
extracted_data = [{k: d[k] for k in keys_to_extract if k in d} for d in comments_lst]
comments_df = pd.DataFrame(extracted_data)
comments_df.shape

(44106, 16)

In [28]:
comments_df[:5]

Unnamed: 0,author,author_fullname,created_utc,permalink,name,link_id,parent_id,subreddit_id,subreddit,num_crossposts,num_comments,score,selftext,title,ups,upvote_ratio
0,TangledUpPuppeteer,t2_mm7m0ccve,1737952000.0,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9eg33m,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,5,5,2964,"“You’re a worthless, spineless piece of shit, ...",,2964,1.0
1,Little_Loki918,t2_oxub5tnr,1737952000.0,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9eevxe,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,46,46,9201,NOR. He should be dead to you. And i would let...,,9201,1.0
2,ORANGENBLACK101214,t2_1kbebmiu,1737952000.0,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9eg3vf,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,13,13,4514,Wait. Does he think insurance is going to just...,,4514,1.0
3,CorruptingTheSystem,t2_9ourbvb,1737952000.0,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9efs9c,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,38,38,13501,“ I already packed your stuff”\n\nSheesh.,,13501,1.0
4,Dry-Newspaper-8311,t2_u67rd0zep,1737952000.0,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9eey0v,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,12,12,8537,NOR what a piece of shit. \nCut him out of you...,,8537,1.0


Now that we have loaded the json file and stored the values we need in dataframe, we want to clean our data. To do that, we start by observing the data and gathering information. 

In [29]:
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44106 entries, 0 to 44105
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   author           44106 non-null  object 
 1   author_fullname  42295 non-null  object 
 2   created_utc      44106 non-null  float64
 3   permalink        44106 non-null  object 
 4   name             44106 non-null  object 
 5   link_id          44106 non-null  object 
 6   parent_id        44106 non-null  object 
 7   subreddit_id     44106 non-null  object 
 8   subreddit        44106 non-null  object 
 9   num_crossposts   44106 non-null  int64  
 10  num_comments     44106 non-null  int64  
 11  score            44106 non-null  int64  
 12  selftext         44106 non-null  object 
 13  title            44106 non-null  object 
 14  ups              44106 non-null  int64  
 15  upvote_ratio     44106 non-null  float64
dtypes: float64(2), int64(4), object(10)
memory usage: 5.4+ MB


In [30]:
comments_df.isna().any()

author             False
author_fullname     True
created_utc        False
permalink          False
name               False
link_id            False
parent_id          False
subreddit_id       False
subreddit          False
num_crossposts     False
num_comments       False
score              False
selftext           False
title              False
ups                False
upvote_ratio       False
dtype: bool

From our exploration above, we see that there is NaN or Null values in only one column in our dataframe. The column is author_fullname, which is the unique id for each account starting with "t2_". While we also have column author which according to Reddit's official document should be unique to each user, we do not want to take risks. Hence, we decide to drop the rows with NaN/Null values inplace:

In [31]:
# Step 4: Remove rows containing NAs from dataframe 
# comments_df.dropna(how = 'any', inplace=True)

In [32]:
comments_df.shape

(44106, 16)

Now, we have a dataframe with no NaN/Null values in any columns, let's check by printing out the info:

In [33]:
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44106 entries, 0 to 44105
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   author           44106 non-null  object 
 1   author_fullname  42295 non-null  object 
 2   created_utc      44106 non-null  float64
 3   permalink        44106 non-null  object 
 4   name             44106 non-null  object 
 5   link_id          44106 non-null  object 
 6   parent_id        44106 non-null  object 
 7   subreddit_id     44106 non-null  object 
 8   subreddit        44106 non-null  object 
 9   num_crossposts   44106 non-null  int64  
 10  num_comments     44106 non-null  int64  
 11  score            44106 non-null  int64  
 12  selftext         44106 non-null  object 
 13  title            44106 non-null  object 
 14  ups              44106 non-null  int64  
 15  upvote_ratio     44106 non-null  float64
dtypes: float64(2), int64(4), object(10)
memory usage: 5.4+ MB


We can see that a lot of the text information's dtype is object. We want to convert them to string. Luckily, Pandas has a function that automatically detects the most suitable dtype and convert all values in dataframe to their most suitable dtype.

In [34]:
# Step 5: Convert dtype: object to string
comments_df = comments_df.convert_dtypes()
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44106 entries, 0 to 44105
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   author           44106 non-null  string 
 1   author_fullname  42295 non-null  string 
 2   created_utc      44106 non-null  Int64  
 3   permalink        44106 non-null  string 
 4   name             44106 non-null  string 
 5   link_id          44106 non-null  string 
 6   parent_id        44106 non-null  string 
 7   subreddit_id     44106 non-null  string 
 8   subreddit        44106 non-null  string 
 9   num_crossposts   44106 non-null  Int64  
 10  num_comments     44106 non-null  Int64  
 11  score            44106 non-null  Int64  
 12  selftext         44106 non-null  string 
 13  title            44106 non-null  string 
 14  ups              44106 non-null  Int64  
 15  upvote_ratio     44106 non-null  Float64
dtypes: Float64(1), Int64(5), string(10)
memory usage: 5.6 MB


The objects are turned to strings. 

We also want to process our created_utc column. However, we are not sure how reddit codes the datetime attribute, let's test it with the first value of created_utc:

In [35]:
# Step 6: Convert dtype: float to datetime
test = comments_df['created_utc'][0]
pd.to_datetime(test, unit='s')

Timestamp('2025-01-27 04:31:04')

Bingo! This conversion result is reasonable and matches with the information shown if we directly visit the page. Let's convert all values in created_utc column.

In [36]:
comments_df['created_utc'] = pd.to_datetime(comments_df['created_utc'], unit = 's')
comments_df[:5]

Unnamed: 0,author,author_fullname,created_utc,permalink,name,link_id,parent_id,subreddit_id,subreddit,num_crossposts,num_comments,score,selftext,title,ups,upvote_ratio
0,TangledUpPuppeteer,t2_mm7m0ccve,2025-01-27 04:31:04,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9eg33m,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,5,5,2964,"“You’re a worthless, spineless piece of shit, ...",,2964,1.0
1,Little_Loki918,t2_oxub5tnr,2025-01-27 04:23:02,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9eevxe,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,46,46,9201,NOR. He should be dead to you. And i would let...,,9201,1.0
2,ORANGENBLACK101214,t2_1kbebmiu,2025-01-27 04:31:13,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9eg3vf,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,13,13,4514,Wait. Does he think insurance is going to just...,,4514,1.0
3,CorruptingTheSystem,t2_9ourbvb,2025-01-27 04:29:01,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9efs9c,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,38,38,13501,“ I already packed your stuff” Sheesh.,,13501,1.0
4,Dry-Newspaper-8311,t2_u67rd0zep,2025-01-27 04:23:25,/r/AmIOverreacting/comments/1iazjin/aio_my_fri...,t1_m9eey0v,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,12,12,8537,NOR what a piece of shit. Cut him out of your...,,8537,1.0


We can also modify the permalink column to complete the fragmented links:

In [37]:
# Step 7: Modifying values
comments_df['permalink'] = 'https://old.reddit.com' + comments_df['permalink']

Now, we have a cleaned dataframe ready for analysis. 

In [38]:
comments_df[:10]

Unnamed: 0,author,author_fullname,created_utc,permalink,name,link_id,parent_id,subreddit_id,subreddit,num_crossposts,num_comments,score,selftext,title,ups,upvote_ratio
0,TangledUpPuppeteer,t2_mm7m0ccve,2025-01-27 04:31:04,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9eg33m,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,5,5,2964,"“You’re a worthless, spineless piece of shit, ...",,2964,1.0
1,Little_Loki918,t2_oxub5tnr,2025-01-27 04:23:02,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9eevxe,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,46,46,9201,NOR. He should be dead to you. And i would let...,,9201,1.0
2,ORANGENBLACK101214,t2_1kbebmiu,2025-01-27 04:31:13,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9eg3vf,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,13,13,4514,Wait. Does he think insurance is going to just...,,4514,1.0
3,CorruptingTheSystem,t2_9ourbvb,2025-01-27 04:29:01,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9efs9c,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,38,38,13501,“ I already packed your stuff” Sheesh.,,13501,1.0
4,Dry-Newspaper-8311,t2_u67rd0zep,2025-01-27 04:23:25,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9eey0v,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,12,12,8537,NOR what a piece of shit. Cut him out of your...,,8537,1.0
5,jazzeriah,t2_29iwtb62,2025-01-27 04:29:17,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9eftpb,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,13,13,5245,He’s a fucking piece of shit dude.,,5245,1.0
6,Junimo116,t2_1cdvprap1d,2025-01-27 04:23:51,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9ef0gr,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,3,3,3577,This isn't a friend. This is someone who only ...,,3577,1.0
7,Willing-Piglet8899,t2_v9d2cnx4,2025-01-27 04:17:46,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9ee3gl,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,14,14,11354,"""car camping haha"" that is infuriating",,11354,1.0
8,Glittering_Raise_710,t2_nhku54oy5,2025-01-27 04:20:50,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9eek1n,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,11,11,2723,NOR. The way he even brought it up is pure shi...,,2723,1.0
9,Stacker2_Motorsports,t2_tfejom8gc,2025-01-27 04:25:42,https://old.reddit.com/r/AmIOverreacting/comme...,t1_m9efakc,t3_1iazjin,t3_1iazjin,t5_4uoy2u,AmIOverreacting,0,0,493,"That is NO friend, fuck that dude. Piece of sh...",,493,1.0


### Save Cleaned Posts and Comments Data to JSON

In [39]:
posts_df.to_json('reddit_posts_cleaned.json', orient='records')

In [40]:
comments_df.to_json('reddit_comments_cleaned.json', orient='records')

Notice we did not remove NA. This is because if NAs are removed, some comments cannot find a matching parent post/comment.

### Create User dict with parent User list

In [41]:
# Extract relevant information from comments_df
comment_replies = {}

# Remove NA values here
comments_df.dropna(how = 'any', inplace=True)
posts_df.dropna(how = 'any', inplace=True)

# Convert posts and comments into lookup dictionaries
post_authors = posts_df.set_index('name')['author_fullname'].to_dict()
comment_authors = comments_df.set_index('name')['author_fullname'].to_dict()

for _, row in comments_df.iterrows():
    user = row['author_fullname']
    parent_id = row['parent_id']

    if parent_id.startswith("t3_"):  # Replying to a post
        parent_user = post_authors.get(parent_id, None)
    elif parent_id.startswith("t1_"):  # Replying to another comment
        parent_user = comment_authors.get(parent_id, None)
    else:
        parent_user = None

    if parent_user:
        if user not in comment_replies:
            comment_replies[user] = []
        comment_replies[user].append(parent_user)

# Convert sets to lists for better usability
comment_replies = {user: list(replied_to) for user, replied_to in comment_replies.items()}

print(comment_replies)


{'t2_mm7m0ccve': ['t2_1hzvzxepdw', 't2_et93i6rsc', 't2_5pdivxkq', 't2_et93i6rsc'], 't2_oxub5tnr': ['t2_1hzvzxepdw'], 't2_1kbebmiu': ['t2_1hzvzxepdw'], 't2_9ourbvb': ['t2_1hzvzxepdw', 't2_1hzvzxepdw'], 't2_u67rd0zep': ['t2_1hzvzxepdw'], 't2_29iwtb62': ['t2_1hzvzxepdw', 't2_1hzvzxepdw', 't2_1i7q15ag'], 't2_1cdvprap1d': ['t2_1hzvzxepdw'], 't2_v9d2cnx4': ['t2_1hzvzxepdw'], 't2_nhku54oy5': ['t2_1hzvzxepdw', 't2_v9d2cnx4', 't2_gag70dr8', 't2_5t9cgirlu', 't2_64hhu', 't2_18j8t2obvw', 't2_16t9fr', 't2_5t9cgirlu', 't2_cwavx', 't2_18j8t2obvw', 't2_x2ep78qp6'], 't2_tfejom8gc': ['t2_1hzvzxepdw', 't2_3iob4q00'], 't2_42gedjvv': ['t2_1hzvzxepdw'], 't2_4140kyai': ['t2_1hzvzxepdw', 't2_1hzvzxepdw', 't2_1hq1grhz', 't2_xuav8', 't2_1hq1grhz'], 't2_sz3xedst': ['t2_1hzvzxepdw', 't2_1hzvzxepdw'], 't2_ruj6avkh': ['t2_1hzvzxepdw'], 't2_xntys': ['t2_1hzvzxepdw'], 't2_5uvd2idm': ['t2_1hzvzxepdw'], 't2_7bsnr8w3': ['t2_1hzvzxepdw'], 't2_zyey9': ['t2_1hzvzxepdw'], 't2_10ixnsyzhm': ['t2_1hzvzxepdw'], 't2_w0eb509c': [

In [42]:
with open('reddit_reply_to_complete.json', 'w') as f:
    json.dump(comment_replies, f)

### Create user dict with bot prob

In [235]:
bot_data = pd.read_csv('./reddit_comments_bot_prob.csv')

In [None]:
bot_data[:10]

Observe the above dataframe, the userid is comment id. Hence, if an indivisual user posted multiple times, the user may receive multiple, contradicting bot classification. We want to calculate the average of bot probability of the user and use that as our standard. We also want to be able to use different thresholds for bot qualification. We compute the boolean values for threshold at 0.5, 0.6, 0.7, 0.8, and 0.9 and store the information in a csv file.

In [237]:
bot_data['commentid'] = 't1_' + bot_data['userid']

In [238]:
del bot_data['userid']

In [None]:
bot_data[:10]

In [240]:
matched_df = bot_data.merge(comments_df, left_on='commentid', right_on='name')

In [None]:
matched_df[:10]

In [242]:
averaged_bot_prob = matched_df.groupby('author_fullname', as_index=True)['botprobability'].mean().to_frame().reset_index()

In [None]:
averaged_bot_prob[:10]

In [None]:
averaged_bot_prob.shape

In [245]:
averaged_bot_prob['thr_50'] = averaged_bot_prob['botprobability'] > 0.5
averaged_bot_prob['thr_60'] = averaged_bot_prob['botprobability'] > 0.6
averaged_bot_prob['thr_70'] = averaged_bot_prob['botprobability'] > 0.7
averaged_bot_prob['thr_80'] = averaged_bot_prob['botprobability'] > 0.8
averaged_bot_prob['thr_90'] = averaged_bot_prob['botprobability'] > 0.9

In [None]:
averaged_bot_prob[:10]

In [247]:
averaged_bot_prob.to_csv('reddit_averaged_bot_prob.csv', index=False) 

### Individual Post Data 

We create files containing "who did this user reply to" information needede to plot network graphs for individual posts using the following code. First, we work with an example, this is the second post from the posts_df.

In [None]:
post = posts_df.iloc[1]['name']
post_user = posts_df.iloc[1]['author_fullname']
post

In [None]:
# Extract relevant information from comments_df
comment_replies = {}

# Extract relevant comments
filtered_comments = comments_df[comments_df['link_id'] == post]
filtered_comments.shape

# Convert comments into lookup dictionaries
comment_authors = filtered_comments.set_index('name')['author_fullname'].to_dict()

for _, row in filtered_comments.iterrows():
    user = row['author_fullname']
    parent_id = row['parent_id']

    if parent_id.startswith("t1_"):  # Replying to another comment
        parent_user = comment_authors.get(parent_id, None)
    elif parent_id == post:  # Replying to a post
        parent_user = post_user
    else:
        parent_user = None

    if parent_user:
        if user not in comment_replies:
            comment_replies[user] = []
        comment_replies[user].append(parent_user)

# Convert sets to lists for better usability
comment_replies = {user: list(replied_to) for user, replied_to in comment_replies.items()}

print(comment_replies)


In [250]:
with open(f'reddit_reply_to_{post}.json', 'w') as f:
    json.dump(comment_replies, f)

We can define the process above to a function:

In [251]:
def post_specific(i):
    post = posts_df.iloc[i]['name']
    post_user = posts_df.iloc[i]['author_fullname']
    comment_replies = {}
    filtered_comments = comments_df[comments_df['link_id'] == post]
    comment_authors = filtered_comments.set_index('name')['author_fullname'].to_dict()

    for _, row in filtered_comments.iterrows():
        user = row['author_fullname']
        parent_id = row['parent_id']

        if parent_id.startswith("t1_"):  # Replying to another comment
            parent_user = comment_authors.get(parent_id, None)
        elif parent_id == post:  # Replying to a post
            parent_user = post_user
        else:
            parent_user = None

        if parent_user:
            if user not in comment_replies:
                comment_replies[user] = []
            comment_replies[user].append(parent_user)

    comment_replies = {user: list(replied_to) for user, replied_to in comment_replies.items()}

    with open(f'reddit_reply_to_{post}.json', 'w') as f:
        json.dump(comment_replies, f)

We generated the JSON file containing reply to relationship information for first 10 posts.

In [252]:
for i in range(11):
    post_specific(i)