In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
reddit = pd.read_csv('../../data/reddit_data/AskReddit_AskScience.csv')

In [4]:
reddit.shape

(1916, 3)

In [5]:
reddit[(reddit['comment'] != '[removed]') & (reddit['comment'] != '[deleted]')].shape # 111 removed or deleted comments

(1805, 3)

In [7]:
reddit[(reddit['comment'] != '[removed]') & (reddit['comment'] != '[deleted]')]['title'].is_unique # at least one duped title in remaining threads

False

In [8]:
reddit[(reddit['comment'] != '[removed]') & (reddit['comment'] != '[deleted]')]['comment'].is_unique # at least one duped comment as well

False

In [9]:
reddit[(reddit['comment'] != '[removed]') & (reddit['comment'] != '[deleted]')]['title'].duplicated().sum() # 23 duplicated thread titles? That's too many for just reposting

23

In [10]:
reddit[(reddit['comment'] != '[removed]') & (reddit['comment'] != '[deleted]')]['comment'].duplicated().sum() # but only one duplicated comment

1

In [11]:
reddit_remaining = reddit[(reddit['comment'] != '[removed]') & (reddit['comment'] != '[deleted]')]

In [12]:
reddit_remaining.loc[reddit_remaining['comment'].duplicated()]

Unnamed: 0,subreddit,title,comment
705,AskReddit,What Company would you Like to Go Bankrupt?,Ticketmaster


In [13]:
reddit_remaining.loc[reddit_remaining['comment'] == 'Ticketmaster', :] # The thread's are similarly worded but the duplicated response isn't invalid. Reddit comments are repetitive so this is par for the course

Unnamed: 0,subreddit,title,comment
506,AskReddit,If you could bankrupt one company which one wo...,Ticketmaster
705,AskReddit,What Company would you Like to Go Bankrupt?,Ticketmaster


In [14]:
reddit_remaining.loc[reddit_remaining['title'].duplicated()] # Turns out askscience has a recurring weekly thread that works differently than a typical thread
                                                             # here the main comments are questions and not answers so I will filter these out since they don't fit.
                                                             # This does mean that my already truncated askscience group will be shortened but it is what it is.
                                                             # The remaining duplicates are just some users trying to get karma by reposting popular questions so they get to stay.
                                                             # I do need to isolate the one repeated AskScience thread however.

Unnamed: 0,subreddit,title,comment
397,AskReddit,What did you not know about sex until you lost...,your family can hear you.
975,AskReddit,What are you convinced people are pretending t...,Posting on LinkedIn
995,AskReddit,What's the worst sex you've ever had?,Got puked on while she was on top straight up ...
1337,AskScience,"Ask Anything Wednesday - Engineering, Mathemat...",What is the estimated mass of all animal life ...
1379,AskScience,"Ask Anything Wednesday - Engineering, Mathemat...",How far away are we from reaching the limit CP...
1420,AskScience,"Ask Anything Wednesday - Economics, Political ...",In the Sherlock Holmes books numbers are often...
1440,AskScience,"Ask Anything Wednesday - Engineering, Mathemat...","Biology: we can live for weeks without eating,..."
1541,AskScience,"Ask Anything Wednesday - Engineering, Mathemat...",How are questions structured in a quantum comp...
1555,AskScience,"Ask Anything Wednesday - Physics, Astronomy, E...",Why is there such an obsession with colonizing...
1586,AskScience,"Ask Anything Wednesday - Engineering, Mathemat...",I'm self taught in some very basic computer pr...


In [15]:
repeated_askscience = reddit_remaining.loc[(reddit_remaining['title'].duplicated()) & (reddit_remaining['subreddit'] == 'AskScience') & (reddit_remaining['title'] != 'What is the smallest possible black hole?')]['title'].tolist()

In [16]:
reddit_no_dupes = reddit_remaining.loc[~reddit_remaining['title'].isin(repeated_askscience)]  # removes all duplicated askScience weekly threads

In [17]:
reddit_no_dupes.shape # Leaves me barely on track to get to 5000 with my remaining subreddits which is moderately troubling

(1782, 3)

Some titles have [Serious] or [NSFW] tags, testing using the browser chatGPT shows the [NSFW] tag can break responses. [Serious] is somewhat important however so I want to keep that tag.

In [20]:
no_nsfw_brackets = reddit_no_dupes['title'].copy(deep = True).apply(lambda x: re.sub(r'\[NSFW\]', '', x))

In [21]:
no_nsfw_brackets.shape

(1782,)

In [22]:
no_nsfw_tags = no_nsfw_brackets.copy(deep = True).apply(lambda x: re.sub(r'\(NSFW\)', '', x)) # one post used parenthesis for some reason so lets fix that too. I also checked [nsfw] and (nsfw) which had no occurences

In [23]:
no_nsfw_tags.shape

(1782,)

In [24]:
reddit_no_dupes.loc[:, 'title'] = no_nsfw_tags.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit_no_dupes.loc[:, 'title'] = no_nsfw_tags.tolist()


In [25]:
reddit_no_dupes[reddit_no_dupes['title'].str.contains('\[NSFW\]')]

Unnamed: 0,subreddit,title,comment


In [26]:
#reddit_no_dupes.to_csv('../cleaned_reddit_data/cleaned_ask_reddit_science.csv', index = False)

In [47]:
ah = pd.read_csv('AskHistory.csv')
eli5 = pd.read_csv('explainlikeimfive.csv')
nsq = pd.read_csv('NoStupidQuestions.csv')
dae = pd.read_csv('DoesAnybodyElse.csv')

In [48]:
ah.shape

(938, 3)

In [49]:
eli5.shape

(1000, 3)

In [50]:
nsq.shape

(999, 3)

In [51]:
dae.shape

(1000, 3)

Below is the same process as above but applied to the remaining data once it was gathered.

In [52]:
ah_remaining = ah[(ah['comment'] != '[removed]') & (ah['comment'] != '[deleted]')]
eli5_remaining = eli5[(eli5['comment'] != '[removed]') & (eli5['comment'] != '[deleted]')]
nsq_remaining  = nsq[(nsq['comment'] != '[removed]') & (nsq['comment'] != '[deleted]')]
dae_remaining = dae[(dae['comment'] != '[removed]') & (dae['comment'] != '[deleted]')]

In [53]:
print(f"AskHistorians rows: {ah_remaining.shape[0]}") # Ask Historians has a ton of removed top comments because they have issues with brigading and misinformation due to political biases
print(f"ELI5 rows: {eli5_remaining.shape[0]}")
print(f"NSQ rows: {nsq_remaining.shape[0]}")
print(f"DAE rows: {dae_remaining.shape[0]}")

AskHistorians rows: 776
ELI5 rows: 926
NSQ rows: 960
DAE rows: 976


In [54]:
# merging them all into one to save a bit of effort
merged_remaining = pd.concat([ah_remaining, eli5_remaining, nsq_remaining, dae_remaining], axis = 0)

In [55]:
merged_remaining.shape

(3638, 3)

In [56]:
merged_remaining['title'].duplicated().sum() # 2 duplicated thread titles

2

In [58]:
merged_remaining['comment'].duplicated().sum() # 6 duplicated comments

6

In [59]:
merged_remaining.loc[(merged_remaining['title'].duplicated()) | (merged_remaining['comment'].duplicated())]  # Looks like ELI5 had some users that deleted their posts due to blackouts using a script in some way
                                                                                                             # The two DAE repeats are interesting, likely those are the two thread repeats for karma farming reasons

Unnamed: 0,subreddit,title,comment
153,explainlikeimfive,eli5: Since caffeine doesn’t actually give you...,This post removed in protest. Visit /r/Save3rd...
194,explainlikeimfive,ELI5: How did global carbon dioxide emissions ...,This post removed in protest. Visit /r/Save3rd...
312,explainlikeimfive,ELI5 those gold/silver emergency blankets: do ...,This post removed in protest. Visit /r/Save3rd...
679,explainlikeimfive,"ELI5: What is a bad faith arguement, exactly?",This post removed in protest. Visit /r/Save3rd...
736,explainlikeimfive,ELI5: Why is Helium so difficult to synthesize?,This post removed in protest. Visit /r/Save3rd...
875,explainlikeimfive,Eli5 How does radiation therapy actually work?...,This post removed in protest. Visit /r/Save3rd...
55,DoesAnybodyElse,DAE type out a whole post or reply but immedia...,For me it's more like 'wow no one gives a shit...
168,DoesAnybodyElse,"DAE think that, from a certain age, elderly pe...",Honestly I think that all drivers should be te...


In [61]:
dupes = merged_remaining.loc[(merged_remaining['title'].duplicated()) | (merged_remaining['comment'].duplicated())]

In [71]:
merged_remaining.loc[merged_remaining['title'] == dupes['title'][168]] # The threads have different reponses so its not that unfair to include both I think, the temperature param SHOULD make the answers different by a bit

Unnamed: 0,subreddit,title,comment
0,DoesAnybodyElse,"DAE think that, from a certain age, elderly pe...","In NZ ""You need to renew your driver licence a..."
168,DoesAnybodyElse,"DAE think that, from a certain age, elderly pe...",Honestly I think that all drivers should be te...


In [73]:
merged_remaining.loc[merged_remaining['title'] == dupes['title'][55]] # Again a repost with a different answer so I'll include this as well

Unnamed: 0,subreddit,title,comment
2,DoesAnybodyElse,DAE type out a whole post or reply but immedia...,How many times did you delete and retype this ...
55,DoesAnybodyElse,DAE type out a whole post or reply but immedia...,For me it's more like 'wow no one gives a shit...


In [74]:
merged_remaining.loc[merged_remaining['comment'] == dupes['comment'][153]] # these need to go however

Unnamed: 0,subreddit,title,comment
98,explainlikeimfive,eli5 With billions and billions of people over...,This post removed in protest. Visit /r/Save3rd...
153,explainlikeimfive,eli5: Since caffeine doesn’t actually give you...,This post removed in protest. Visit /r/Save3rd...
194,explainlikeimfive,ELI5: How did global carbon dioxide emissions ...,This post removed in protest. Visit /r/Save3rd...
312,explainlikeimfive,ELI5 those gold/silver emergency blankets: do ...,This post removed in protest. Visit /r/Save3rd...
679,explainlikeimfive,"ELI5: What is a bad faith arguement, exactly?",This post removed in protest. Visit /r/Save3rd...
736,explainlikeimfive,ELI5: Why is Helium so difficult to synthesize?,This post removed in protest. Visit /r/Save3rd...
875,explainlikeimfive,Eli5 How does radiation therapy actually work?...,This post removed in protest. Visit /r/Save3rd...


In [75]:
merged_no_dupes = merged_remaining.loc[merged_remaining['comment'] != dupes['comment'][153]]  # removes all duplicated reddit blackout protest comments

In [76]:
merged_no_dupes.shape # Filtered out the 7 removed comments

(3631, 3)

In [78]:
merged_no_dupes['title'][merged_no_dupes['title'].str.contains('\[NSFW\]')] # One thread has [NSFW]

656    [NSFW] Why does everyone make it seem that onc...
Name: title, dtype: object

In [79]:
merged_no_dupes['title'][merged_no_dupes['title'].str.contains('\(NSFW\)')] # Another one with parentheses

88    DAE (NSFW) Have a porn video that they watched...
Name: title, dtype: object

In [82]:
merged_no_dupes['title'][merged_no_dupes['title'].str.contains('\[SERIOUS\]')] # One with SERIOUS so that will need to go

165    [SERIOUS] How do i desert the army? I'm russia...
Name: title, dtype: object

In [84]:
cleaned_titles = merged_no_dupes['title'].apply(lambda x: re.sub('\(NSFW\)', '', re.sub('\[SERIOUS\]', '', re.sub(r'\[NSFW\]', '', x))))

In [85]:
merged_no_dupes.loc[:, 'title'] = cleaned_titles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_no_dupes.loc[:, 'title'] = cleaned_titles


In [86]:
merged_no_dupes.head()

Unnamed: 0,subreddit,title,comment
0,AskHistorians,Could people do backflips/front flips in ancie...,Some of the earliest images of people doing fl...
1,AskHistorians,"How did the USA go from robust trade unions, a...","Unfortunately on taxation rates, the presumpti..."
2,AskHistorians,What pop history book has done the most damage...,"""The Gangs of New York"" is a very entertaining..."
3,AskHistorians,How would a professional historian look for th...,# Gandalf’s Reference Desk Query\n\nA little r...
4,AskHistorians,The aftermath of Kanye West’s antisemitic rhet...,Black Hebrew mythology originates from the sam...


In [87]:
#merged_no_dupes.to_csv('../cleaned_reddit_data/cleaned_remaining_subs.csv', index = False)