In [1]:
%reset -f

In [73]:
import requests
import praw
from praw.models import MoreComments 
import pandas as pd
from collections import defaultdict
from bs4 import BeautifulSoup

In [74]:
%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


In [75]:
# Create read only instance
import json 
json_file = "reddit_api_config.json"
json_key = "reddit_api"
with open(json_file) as f:
    data = json.load(f)
user_values = data[json_key]

reddit = praw.Reddit(
    client_id=user_values['client_id'],
    client_secret=user_values['client_secret'],
    user_agent=user_values['user_agent'],
)


In [76]:
# Test if the read only instance was created properly 
print(reddit.read_only)

True


In [77]:
# Select the subreddit 
subreddit = reddit.subreddit("Accounting").stream
# For this case, we have a specific url that we would like to work on 
url = "https://www.reddit.com/r/Accounting/comments/o6c81e/pwc_2021_compensation_thread/"
# Create submission object bsaed on the url 
submission = reddit.submission(url=url)

For this url, the breakdown of the category is as below 
1. Market/Office
2. Trust or Consulting Solutions and LOS/Vertical
3. CY Level -> FY22 Level (A1>A2, S1->S2, S3->M1, etc)
4. Rating
5. Old Salary -> New Salary
6. Bonus
7. Interesting notes on what RLs/RPs have told you related to future comp.
8. Anything else? (opinions on the cohort model for all LOS, opinions on the new equation, etc)

In [78]:
# create a dictionary to organize the submission comments
category_list = [
    "Location",
    "Type",
    "Change in level",
    "Rating",
    "Salary",
    "Bonus",
    "Notes",
    "Other",
]
# numbering for the category list
num_keys = range (1, len(category_list)+1)

# create looup dictionary to identify category based on the number 
lookup_dict = dict(zip(num_keys, category_list))

# create list of Falses 
logic_list = [False]*len(category_list)

# this dictionary will keep track of which items are responded
check_dict = dict(zip(num_keys, logic_list))

In [79]:
print(submission.selftext_html)
# print(Beautiful)


<!-- SC_OFF --><div class="md"><p>Alright folks, looks like a good number of people are getting their comp information over the next few days. We’ve seen good assurance, I mean Trust Solutions Assurance, bumps, what about the rest of us?</p>

<ol>
<li>Market/Office</li>
<li>Trust or Consulting Solutions and LOS/Vertical</li>
<li>CY Level -&gt; FY22 Level (A1&gt;A2, S1-&gt;S2, S3-&gt;M1, etc)</li>
<li>Rating</li>
<li>Old Salary -&gt; New Salary</li>
<li>Bonus</li>
<li>Interesting notes on what RLs/RPs have told you related to future comp.</li>
<li>Anything else? (opinions on the cohort model for all LOS, opinions on the new equation, etc)</li>
</ol>
</div><!-- SC_ON -->


In [80]:
# using the defaultdict library, create a dictionary of list 
# Ex. {"Location": ["NY", "HCOL"]}
submission_dict = defaultdict(list)

In [81]:
# obtain the submission comments object 
# comments = submission.comments.replace_more(limit=None, threshold=0)
comments = submission.comments


In [82]:
# Need to call replace_more in order to avoid the Morecomments error
comments.replace_more(limit=None)
# for each comment in the comments object
for comment in comments: 
    # print(f"worked on the {comment.body}")
    # obtain the comment body 
    content = comment.body
    # split the contents line by line  
    content = content.split("\n")
    # remove empty strings from the list 
    content = list(filter(None, content))
    # only select line that start with the numbers 
    content = [i for i in content if i[0].isnumeric()]
    # if the list is blank 
    if not content: 
        #skip that item
        continue
    # create list of numbers from dict keys
    key_list = str(list(lookup_dict.keys()))
    # for each line in the content, or the comment 
    for line in content:
        # the number is always the first character 
        heading_number = float(line[0])
        # obtain the category using the look up table 
        if check_dict[heading_number] == True: 
            continue
        category = lookup_dict[heading_number]
        check_dict[heading_number] = True
        # append the response based on the category key 
        submission_dict[category].append(line.strip()[2:].strip())
        print(len(submission_dict[category]))

        # print(f"worked on the {line}")

    for number in num_keys:
        if check_dict[number]:
            check_dict[number]=False
            print(f"did respond {number}")
            # continue
        else:
            # print(number)
            category = lookup_dict[number]
            # print("category")
            # print(f"before {submission_dict[category]}")
            submission_dict[category].append("")
            # print(f"after {submission_dict[category]}")
            check_dict[number]=False
            # print(f"did not respond {number}")
            # continue
    
    # we need to be careful because some people do not post response for all the question. Therefore, if they skip one or 

1
1
1
1
1
1
1
1
did respond 1
did respond 2
did respond 3
did respond 4
did respond 5
did respond 6
did respond 7
did respond 8
2
2
2
2
2
2
2
2
did respond 1
did respond 2
did respond 3
did respond 4
did respond 5
did respond 6
did respond 7
did respond 8
3
3
3
3
3
3
3
did respond 1
did respond 2
did respond 3
did respond 4
did respond 5
did respond 6
did respond 7
4
4
4
4
4
4
4
did respond 1
did respond 2
did respond 3
did respond 4
did respond 5
did respond 6
did respond 7
5
5
5
5
5
5
5
5
did respond 1
did respond 2
did respond 3
did respond 4
did respond 5
did respond 6
did respond 7
did respond 8
6
6
6
6
6
6
6
6
did respond 1
did respond 2
did respond 3
did respond 4
did respond 5
did respond 6
did respond 7
did respond 8
7
7
7
7
7
7
did respond 1
did respond 2
did respond 3
did respond 4
did respond 5
did respond 6
8
8
8
8
8
8
8
did respond 1
did respond 2
did respond 3
did respond 4
did respond 5
did respond 6
did respond 7
9
9
9
9
9
did respond 1
did respond 2
did respond 3
did 

In [83]:
for key in list(submission_dict.keys()):
    print(len(submission_dict[key]))

87
87
87
87
87
87
87
87


In [84]:
df = pd.DataFrame(submission_dict)
df.head()
df.to_csv("test.csv")