https://github.com/huggingface/transformers/blob/0ae96ff8a7e2d371242452d81bee85da8df202f5/examples/text-generation/run_generation.py

# get a bunch of user comments

In [1]:
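# NOTE: praw.Reddit() is called below without arguments, so the API credentials must come from praw.ini -- check the one in the project root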
import praw

In [2]:
# Build the PRAW client. No arguments are passed here, so the configuration
# (e.g. praw.ini) has to be supplied from outside this call.
reddit = praw.Reddit()

In [3]:
# Reddit username whose comments are downloaded; it is also used to name the
# output directory (data/user/<user_name>) further below.
user_name = 'suncoasthost'

In [4]:
# Redditor handle for user_name; its comment listing is iterated further below.
user = reddit.redditor(user_name)

In [5]:
def get_context(comment, reddit):
    """Look up the direct context of a comment.

    Args:
        comment: object exposing ``link_id`` (submission fullname, ``t3_``
            prefixed) and ``parent_id`` (fullname of whatever it replies to).
        reddit: client exposing ``submission(id=...)`` and ``comment(id=...)``.

    Returns:
        A ``(parent_comment, submission)`` tuple. ``parent_comment`` is
        ``None`` when the comment replies directly to the submission.
    """
    link_id = comment.link_id
    submission = reddit.submission(id=link_id.replace('t3_', ''))

    # A top level comment has the submission itself as its parent.
    if comment.parent_id == link_id:
        return None, submission

    # Otherwise the parent is another comment (``t1_`` prefixed fullname).
    parent_id = comment.parent_id.replace('t1_', '')
    return reddit.comment(id=parent_id), submission

def get_all_context(comment, reddit):
    """Collect every ancestor comment of ``comment`` plus its submission.

    Walks up the reply chain with ``get_context`` until a top level comment
    is reached. The walk is iterative so that very deep threads cannot hit
    Python's recursion limit, and the ancestor list is built with ``append``
    instead of repeated list concatenation.

    Args:
        comment: the comment whose ancestry is wanted.
        reddit: client object forwarded unchanged to ``get_context``.

    Returns:
        A ``(parent_comments, submission)`` tuple. ``parent_comments`` is a
        list ordered from the top level ancestor down to the direct parent;
        it is empty when ``comment`` is itself top level.
    """
    ancestors = []
    current = comment
    while True:
        parent_comment, submission = get_context(current, reddit)
        if parent_comment is None:
            # Reached a top level comment: nothing further up the chain.
            break
        ancestors.append(parent_comment)
        current = parent_comment

    # Collected nearest-first; callers expect the outermost ancestor first.
    ancestors.reverse()
    return ancestors, submission

In [6]:
import os
import json
import pickle

In [7]:
from datetime import datetime

In [22]:
def format_comment_as_json(comment):
    """Reduce a comment object to a plain, JSON-serialisable dict.

    Args:
        comment: object exposing ``id``, ``author``, ``body``,
            ``created_utc`` and ``permalink``.

    Returns:
        A dict with those five keys. ``author`` holds the author's ``name``,
        or ``None`` when the comment has no author object.
    """
    author = comment.author
    record = {'id': comment.id}
    record['author'] = None if author is None else author.name
    # The remaining fields are copied over verbatim.
    for field in ('body', 'created_utc', 'permalink'):
        record[field] = getattr(comment, field)
    return record

def format_submission_as_json(submission):
    """Reduce a submission object to a plain, JSON-serialisable dict.

    Args:
        submission: object exposing ``id``, ``subreddit`` (with a
            ``display_name``), ``title``, ``selftext`` and ``permalink``.

    Returns:
        A dict with the keys ``id``, ``subreddit``, ``title``, ``selftext``
        and ``permalink``; ``subreddit`` holds the subreddit's display name.
    """
    record = {}
    record['id'] = submission.id
    record['subreddit'] = submission.subreddit.display_name
    # The remaining fields are copied over verbatim.
    for field in ('title', 'selftext', 'permalink'):
        record[field] = getattr(submission, field)
    return record

In [23]:
import pandas as pd

In [25]:
# Dump every comment of `user` (with its parent comment and submission) to
# data/user/<user_name>/<comment_id>.json. A manifest CSV in the same
# directory records which comment ids were already dumped, so re-running
# this cell only fetches comments that are new.
outpath = 'data/user/{}'.format(user_name)
os.makedirs(outpath, exist_ok=True)

manifestpath = os.path.join(outpath, 'manifest.csv')
if not os.path.isfile(manifestpath):
    manifestdf = None
    with open(manifestpath, 'w+') as f:
        f.write('comment_id, created_utcnow_isoformat\n')
    seen_comment_ids = set()
else:
    # Read ids as strings: an id made only of digits would otherwise be
    # parsed as an integer and never compare equal to `comment.id`.
    manifestdf = pd.read_csv(manifestpath, dtype=str)
    # Build the lookup once instead of rebuilding a list per comment.
    seen_comment_ids = set(manifestdf['comment_id'].dropna())

# args for user.comments.new()
# https://praw.readthedocs.io/en/latest/code_overview/other/listinggenerator.html#praw.models.ListingGenerator
# limit – default 100, max 1000
limit = None
for i, comment in enumerate(user.comments.new(limit=limit)):
    print ('[{}/{}] id: {}, body: {}'.format(
        i, limit, comment.id, comment.body.replace('\n', ' ').replace('\t', ' ')[:50]
    ))
    if comment.id in seen_comment_ids:
        print ('skip since comment dump exists...')
        continue

    parent_comment, submission = get_context(comment, reddit)
    package = {
        'comment': format_comment_as_json(comment),
        'parent_comment': format_comment_as_json(parent_comment) if parent_comment is not None else None,
        'submission': format_submission_as_json(submission)
    }
    # Write the dump first and record it in the manifest only afterwards:
    # if the dump fails, the comment is not marked as done and is retried
    # on the next run.
    with open(os.path.join(outpath, '{}.json'.format(comment.id)), 'w+') as f:
        json.dump(package, f, indent=4)
    with open(manifestpath, 'a+') as f:
        f.write('{}, {}\n'.format(comment.id, datetime.utcnow().isoformat()))
    seen_comment_ids.add(comment.id)

[0/None] id: fwbahg8, body: not my choice. was the cto
[1/None] id: fw9xf8b, body: depends on your hosting situation. I am hosted on 
[2/None] id: fw9q6m5, body: it is but it scales up easy. Major organizations u
[3/None] id: fw6ewe6, body: Since you are using Angular for your front end (cl
[4/None] id: fvwmkq9, body: True, I only mentioned it because OP said they wer
[5/None] id: fvw3s2e, body: Wordpress itself offers pretty affordable hosting.
[6/None] id: fvuueav, body: check to see if your server instance is required a
[7/None] id: fvtdytg, body: line 8 is improper syntax.   set the “port” proper
[8/None] id: fvhkuri, body: It might be worth taking a hard look at yourself a
[9/None] id: fvh3dra, body: I recommend HTML5 Blank.
[10/None] id: fve1jrp, body: You need to narrow down the requirements of your A
[11/None] id: fvc7r42, body: The problem with passing the cookie is that it can
[12/None] id: fv8g2qn, body: storing JWT in local storage is not inherently uns
[13/None] id: fv74pe

[109/None] id: ej9lbxg, body: IMO classes in Javascript are not as fully feature
[110/None] id: ej9l11z, body: Don’t move from a topic or subject unless you full
[111/None] id: ej9kv5z, body: I recommend HTML5 Blank Theme to start. It’s a bas
[112/None] id: ej710tk, body: Ok I don’t think I was understanding your request 
[113/None] id: ej6ajpl, body: Wordpress is built with php not node.js. You typic
[114/None] id: ej683bk, body: You can do that pretty easily. This link explains 
[115/None] id: ej670j6, body: you could use $.load function to load the page int
[116/None] id: eikuhmy, body: I agree sockets is the way to go here. You could e
[117/None] id: ehnzxob, body: I recommend Vue.js. There is strong support and pl
[118/None] id: eex40n8, body: http://html5blank.com/ they have really good docum
[119/None] id: eex2t5w, body: I use Blank HTML 5 theme for WordPress. With that 
[120/None] id: eere501, body: I like Elementor but recently switched to Brizy. B
[121/None] id: eentozl, body

In [26]:
!gsutil -m rsync -r data/user gs://astroturf-dev/user


both the source and destination. Your crcmod installation isn't using the
module's C extension, so checksumming will run very slowly. If this is your
first rsync since updating gsutil, this rsync can take significantly longer than
usual. For help installing the extension, please see "gsutil help crcmod".

Building synchronization state...
Starting synchronization...
Copying file://data/user/.DS_Store [Content-Type=application/octet-stream]...
Copying file://data/user/suncoasthost/e39e35r.json [Content-Type=application/json]...
Copying file://data/user/suncoasthost/e3cjvc1.json [Content-Type=application/json]...
Copying file://data/user/suncoasthost/e3gwflw.json [Content-Type=application/json]...
Copying file://data/user/suncoasthost/e3n9dyy.json [Content-Type=application/json]...
Copying file://data/user/suncoasthost/e3n9i86.json [Content-Type=application/json]...
Copying file://data/user/suncoasthost/e3n9pt6.json [Content-Type=application/json]...
Copying file://data/user/suncoasthos