In [3]:
import numpy as np
import pandas as pd

# Data Preprocessing

To handle the large dataset, read the data in chunks.

In [17]:
# Number of rows parsed per chunk while streaming each CSV file.
chunk_size = 10000

# Open both exports as chunk iterators; nothing is parsed until they are consumed.
chunks = pd.read_csv('../data/Reddit-Threads_2020-2021.csv', chunksize=chunk_size)
chunks2 = pd.read_csv('../data/Reddit-Threads_2022-2023.csv', chunksize=chunk_size)

# Consume each iterator and stitch its chunks into one frame per file.
data_2021 = pd.concat(list(chunks))
data_2223 = pd.concat(list(chunks2))

# Stack the two files and replace the per-file row labels with a fresh 0..n-1 index.
combined_data = pd.concat([data_2021, data_2223], ignore_index=True)

In [18]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4938371 entries, 0 to 4938370
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   text          object
 1   timestamp     object
 2   username      object
 3   link          object
 4   link_id       object
 5   parent_id     object
 6   id            object
 7   subreddit_id  object
 8   moderation    object
dtypes: object(9)
memory usage: 339.1+ MB
None


In [19]:
# Keep every row that has at least one missing field, then preview the first few.
missing_values = combined_data.loc[combined_data.isna().any(axis='columns')]
missing_values.head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation
14357,“He who has a why to live for can bear almost ...,,,,,,,,
14358,“He who has a why to live for can bear almost ...,,,,,,,,
14359,“He who has a why to live for can bear almost ...,,,,,,,,
14360,“He who has a why to live for can bear almost ...,,,,,,,,
14361,“He who has a why to live for can bear almost ...,,,,,,,,


In [20]:
# Discard every row that contains at least one missing value.
combined_data = combined_data.dropna(axis=0, how='any')

Extract `yearmonth` from `timestamp` for temporal analysis

In [21]:
combined_data = (
    combined_data
    # Parse the raw timestamp values into datetimes.
    .assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))
    # Derive a monthly period from the parsed timestamp.
    .assign(yearmonth=lambda df: df['timestamp'].dt.to_period('M'))
    # Put rows in chronological order and renumber them from 0.
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

Extract the thread's `title` for each comment from its `link`

In [26]:
# The title is the sixth '/'-separated piece of `link` (index 5), with its
# underscores turned into spaces. The vectorised .str accessors yield NaN for
# links that have fewer than six pieces, where indexing the split list by hand
# would raise IndexError and abort the whole cell.
combined_data['title'] = (
    combined_data['link']
    .str.split('/')
    .str.get(5)
    # regex=False: '_' is a literal character, not a pattern.
    .str.replace('_', ' ', regex=False)
)

In [27]:
# Preview the first five rows, including the derived yearmonth and title columns.
combined_data.head(n=5)

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation,yearmonth,title
0,SUTD entry requirements is more or less the sa...,2020-01-01 00:00:33,Twrd4321,/r/singapore/comments/eia0ap/sit_suss_or_sutd/...,t3_eia0ap,t1_fcom49y,fconqf5,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",2020-01,sit suss or sutd
1,"With jam like this, it's barely worth it.",2020-01-01 00:01:05,557088,/r/singapore/comments/ei5a1u/how_congested_the...,t3_ei5a1u,t1_fcnh5ns,fconrxl,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",2020-01,how congested the ecp is on new years day
2,"I outside now, I guess reaching there around 0...",2020-01-01 00:12:57,HeavyArmsJin,/r/singapore/comments/ei8j4h/whats_the_earlies...,t3_ei8j4h,t3_ei8j4h,fcooqwc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",2020-01,whats the earliest timing i can visit the guan
3,The female or male cry?,2020-01-01 00:12:59,Tempestuous-,/r/singapore/comments/ei9klf/rsingapore_random...,t3_ei9klf,t1_fcohcxx,fcoor03,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",2020-01,rsingapore random discussion and small questions
4,bring an umbrella! it gets really hot during t...,2020-01-01 00:13:37,WiisdomTooth,/r/singapore/comments/ei2z6d/what_is_there_to_...,t3_ei2z6d,t3_ei2z6d,fcoosvb,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",2020-01,what is there to prepare for a trip to st johns


In [28]:
# Persist the preprocessed frame; the row index is not written to the file.
combined_data.to_csv(path_or_buf='../data/combined_data.csv', index=False)