# Exploratory Data Analysis

Hypothesis: What is this notebook about?

In [1]:
# autoreload your package
%load_ext autoreload
%autoreload 2
import rrational


In [2]:
## secrets
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

import warnings
# warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", ".*does not have many workers.*")
warnings.filterwarnings("ignore", ".*divide by zero.*")

## numeric, plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (7.0, 4)

## utils
from pathlib import Path
from tqdm.auto import tqdm
import logging, os, re
import collections, functools, itertools

# logging
from loguru import logger
logger.remove()
logger.add(os.sys.stdout, level="ERROR", colorize=True, format="<level>{time} | {message}</level>")


1

In [3]:
# dotenv
import os
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

True

In [4]:

import praw

reddit = praw.Reddit(
    client_id=os.environ["CLIENT_ID"],
    client_secret=os.environ["CLIENT_SECRET"],
    password=os.environ["PASSWORD"],
    user_agent="testscript by u/fakebot3",
    username=os.environ["USERNAME"],
)

In [5]:
s = reddit.subreddit("rational")
s

Subreddit(display_name='rational')

In [6]:
# TODO also add
# Comic Recommendation thread             : bit.ly/1SPwDfz
# 'Time Travel' Recommendation thread     : bit.ly/1VVcNyH
# Worm Fanfiction Recommendation thread   : bit.ly/1VVd0lB
# Obscure(sic) Recommendation thread      : bit.ly/1PXBvtR
# General Fanfiction Recommendation thread: bit.ly/1SxDNXq 
# Miscellaneous Recommendation thread     : bit.ly/1PXB9n6

Note reddit tries to prevent scraping by making it so we have no way to list all submissions. Here's a workaround

- search: https://old.reddit.com/r/rational/search?q=%22Monday+Request+and+Recommendation+Thread%22&restrict_sr=on&sort=new&t=all#res:ner-page=10
- keep paging down (I got 4 years and 250 things)
- paste this into the console

```
a=$('a.search-title')
b=a.toArray().map(u=>u.href.split('/')[6])
copy(b)
```

In [7]:
submissions = list(s.search("Recommendation Thread"))
print(len(submissions))

100


In [8]:
import json
ids_manual = json.load(open('../data/submissions.json'))
submissions += [reddit.submission(id) for id in ids_manual]
submissions

[Submission(id='1e3vhth'),
 Submission(id='1ef093v'),
 Submission(id='1ew2jku'),
 Submission(id='1f1ownr'),
 Submission(id='1b6cz68'),
 Submission(id='1byylp5'),
 Submission(id='1bt4gkn'),
 Submission(id='1cabj2s'),
 Submission(id='1d1s69f'),
 Submission(id='1dcm7hq'),
 Submission(id='1ekoil8'),
 Submission(id='1eqehs7'),
 Submission(id='1ap1xea'),
 Submission(id='1dne6df'),
 Submission(id='1dhys6k'),
 Submission(id='1bc3v36'),
 Submission(id='1d754n0'),
 Submission(id='1cwgahd'),
 Submission(id='197apaz'),
 Submission(id='1cr00xb'),
 Submission(id='1f77g8u'),
 Submission(id='1adxacl'),
 Submission(id='1e9f0la'),
 Submission(id='1c4mxk4'),
 Submission(id='1dstocq'),
 Submission(id='10v8os8'),
 Submission(id='1ajihyf'),
 Submission(id='1cljrk3'),
 Submission(id='191n3o1'),
 Submission(id='1dy9n53'),
 Submission(id='1aupa48'),
 Submission(id='1854wdx'),
 Submission(id='14pjuec'),
 Submission(id='147ouxh'),
 Submission(id='1bhrf53'),
 Submission(id='11qai9r'),
 Submission(id='17el34d'),
 

In [9]:
ids_manual2 = json.load(open('../data/submissions_org_mode.json'))
submissions += [reddit.submission(id) for id in ids_manual2]
submissions

[Submission(id='1e3vhth'),
 Submission(id='1ef093v'),
 Submission(id='1ew2jku'),
 Submission(id='1f1ownr'),
 Submission(id='1b6cz68'),
 Submission(id='1byylp5'),
 Submission(id='1bt4gkn'),
 Submission(id='1cabj2s'),
 Submission(id='1d1s69f'),
 Submission(id='1dcm7hq'),
 Submission(id='1ekoil8'),
 Submission(id='1eqehs7'),
 Submission(id='1ap1xea'),
 Submission(id='1dne6df'),
 Submission(id='1dhys6k'),
 Submission(id='1bc3v36'),
 Submission(id='1d754n0'),
 Submission(id='1cwgahd'),
 Submission(id='197apaz'),
 Submission(id='1cr00xb'),
 Submission(id='1f77g8u'),
 Submission(id='1adxacl'),
 Submission(id='1e9f0la'),
 Submission(id='1c4mxk4'),
 Submission(id='1dstocq'),
 Submission(id='10v8os8'),
 Submission(id='1ajihyf'),
 Submission(id='1cljrk3'),
 Submission(id='191n3o1'),
 Submission(id='1dy9n53'),
 Submission(id='1aupa48'),
 Submission(id='1854wdx'),
 Submission(id='14pjuec'),
 Submission(id='147ouxh'),
 Submission(id='1bhrf53'),
 Submission(id='11qai9r'),
 Submission(id='17el34d'),
 

In [10]:
submissions += list(s.search("Monday Request and Recommendation Thread"))
print(len(submissions))
# submissions

10510


In [11]:
# from https://github.dev/JosefAlbers/rd2md

import textwrap
from datetime import datetime


def format_comment(comment, depth=0, upvote_threshold=2):
    if comment.score < upvote_threshold:
        return ""
    indent = "  " * depth
    author_line = f"{indent}- u/{comment.author}:\n"
    dedented_body = textwrap.dedent(comment.body).strip()
    indented_body = textwrap.indent(dedented_body, indent + '  ')
    comment_block = f"{indent + '  '}```\n{indented_body}\n{indent + '  '}```\n\n"
    formatted = author_line + comment_block
    for reply in comment.replies:
        formatted += format_comment(reply, depth + 1, upvote_threshold)

    return formatted


def submission_to_markdown(post, comment_score_threshold=0, verbose=False):
    post_content = []
    post_content.append(f"## {post.title}\n\n")
    if verbose:
        post_content.append(f"* Author: u/{post.author}\n")
        post_content.append(f"* URL: {post.url}\n")
        post_content.append(f"* Score: {post.score}\n\n")
    post_content.append("### Post:\n\n")
    if post.is_self:
        content = post.selftext
        content = content.replace('\n#', '\n####') # md headings
        post_content.append(f"{content}\n\n")
    else:
        post_content.append(f"[Link to content]({post.url})\n\n")
    post_content.append("### Comments:\n\n")
    post.comments.replace_more(limit=None)
    for comment in post.comments:
        post_content.append(format_comment(comment, upvote_threshold=comment_score_threshold))
    post_content.append("---\n\n")
    return ''.join(post_content)




In [12]:
outdir = Path("../data/cache")

In [13]:
for submission in tqdm(submissions):
    f = outdir / f"{submission.id}.md"
    if not f.exists():
        md = submission_to_markdown(submission, 2)
        f.write_text(md)

  0%|          | 0/10510 [00:00<?, ?it/s]