In [83]:
import datetime
import json
import lxml.html
from lxml.html.clean import Cleaner
import requests
import time
from  tqdm import tqdm

# GraphQL endpoint that every post query in this notebook is POSTed to.
LW_API_URL = 'https://www.lesswrong.com/graphql'

# Shared lxml Cleaner used by clean_html(). kill_tags lists <style> and <script>;
# allow_tags=[''] names no real tag, presumably so that all remaining markup is
# stripped and only text is left -- TODO confirm against the lxml Cleaner docs.
# NOTE(review): check that allow_tags may be combined with the default
# remove_unknown_tags setting in the installed lxml version. If the cleaner
# raises, convert_post() swallows the error and posts keep their raw HTML.
cleaner = Cleaner(allow_tags=[''], kill_tags=['style', 'script'])
def clean_html(text):
    """Parse an HTML string, run it through the shared ``cleaner`` and return
    the ``.text`` attribute of the cleaned root element.

    NOTE(review): ``.text`` only holds the text that precedes the first child
    element, so this relies on the cleaner having removed every tag below the
    root -- TODO confirm.
    """
    parsed = lxml.html.fromstring(text)
    cleaned = cleaner.clean_html(parsed)
    return cleaned.text
    
def get_posts(before, span = datetime.timedelta(weeks=1), clean=True, timeout=60):
    """Fetch the posts published in the window ``[before - span, before]``.

    Parameters
    ----------
    before : datetime.date or datetime.datetime
        Upper bound of the window. Only its ``%Y-%m-%d`` part is sent.
    span : datetime.timedelta
        Length of the window; the lower bound is ``before - span``.
    clean : bool
        Forwarded to ``convert_post`` for every returned post.
    timeout : float or tuple, optional
        Passed to ``requests.post`` so a stalled connection cannot hang the
        caller forever. ``None`` disables the timeout.

    Returns
    -------
    list
        One ``convert_post`` result per post. An empty list is returned when
        the response is not valid JSON or lacks ``data.posts.results``.

    Raises
    ------
    requests.RequestException
        Network-level failures (including timeouts) are not caught here.
    """
    after = before - span

    before_s = before.strftime('%Y-%m-%d')
    after_s = after.strftime('%Y-%m-%d')

    posts_query = """
    {
      posts(input: {
        terms: {
          view: "new"
          meta: null  
          before: "%s"
          after: "%s"
        }
      }) {
        results {
          htmlBody
          title
          baseScore
          af
          _id
          userId
        }
      }
    }
    """ % (before_s, after_s)

    response = requests.post(LW_API_URL, json={'query': posts_query}, timeout=timeout)

    try:
        response_data = response.json()['data']['posts']['results']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON. KeyError/TypeError: an expected level
        # is missing or null (e.g. a GraphQL error payload with "data": null).
        # Anything else, such as KeyboardInterrupt, is allowed to propagate.
        return []

    # "results" itself may be null; treat that as an empty window.
    if not response_data:
        return []

    return [convert_post(post, clean=clean) for post in response_data]

def convert_post(post, clean=True):
    """Rename the fields of one raw API post to the corpus field names.

    Parameters
    ----------
    post : mapping
        Must contain the keys ``htmlBody``, ``title``, ``baseScore``, ``af``,
        ``_id`` and ``userId``; any other keys are ignored.
    clean : bool
        When true, the body is passed through ``clean_html``. Cleaning is
        best-effort: if it fails the original body is kept unchanged.

    Returns
    -------
    dict
        Keys ``text``, ``title``, ``karma``, ``af``, ``id`` and ``userid``.

    Raises
    ------
    KeyError
        If ``post`` lacks one of the expected keys.
    """
    key_mapping = {
        'htmlBody': 'text',
        'title': 'title',
        'baseScore': 'karma',
        'af': 'af',
        '_id': 'id',
        'userId': 'userid'
    }

    converted = {v: post[k] for k, v in key_mapping.items()}

    # Nothing to clean for a missing or empty body.
    if clean and converted['text']:
        try:
            converted['text'] = clean_html(converted['text'])
        except Exception:
            # Best effort: keep the raw body if parsing/cleaning fails.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit can still stop a long-running scrape.
            pass

    return converted

def create_corpus(n_posts=100, filename='lw_corpus.json', span=datetime.timedelta(weeks=1), clean=True, start=None, sleep=0):
    """Collect at least ``n_posts`` posts, walking backwards in time, and save
    them to ``filename`` as a single JSON array.

    Parameters
    ----------
    n_posts : int
        Minimum number of posts to collect. Whole windows are kept, so the
        result can contain more than ``n_posts`` entries.
    filename : str
        Output path; overwritten on every call.
    span : datetime.timedelta
        Size of each query window and of the step taken between windows.
    clean : bool
        Forwarded to ``get_posts``.
    start : datetime.datetime, optional
        Upper bound of the first window; defaults to the current time.
    sleep : float
        Seconds to wait before each request (0 disables the pause).

    Returns
    -------
    list
        All collected posts, in the order they were fetched.

    Notes
    -----
    The loop only ends once enough posts have been gathered, so it keeps
    requesting older windows for as long as ``get_posts`` returns nothing.
    """
    if start is None:
        start = datetime.datetime.now()

    posts = []

    with tqdm(total=n_posts) as pbar:
        while len(posts) < n_posts:
            if sleep:
                time.sleep(sleep)
            # Pass ``span`` through so the fetched window has the same length
            # as the step below; otherwise get_posts falls back to its own
            # default and windows either leave gaps or overlap.
            new_posts = get_posts(start, span=span, clean=clean)
            posts.extend(new_posts)
            start = start - span
            pbar.update(len(new_posts))

    with open(filename, 'w') as f:
        json.dump(posts, f)

    return posts

def create_corpus_jsonl(n_posts=100, filename='lw_corpus.jsonl', span=datetime.timedelta(weeks=1), clean=True, start=None, sleep=0):
    """Collect at least ``n_posts`` posts, walking backwards in time, and
    append them to ``filename`` as JSON Lines (one post per line).

    Parameters
    ----------
    n_posts : int
        Minimum number of posts to write. Whole windows are written, so more
        than ``n_posts`` lines may be produced.
    filename : str
        Output path. The file is opened in append mode, so existing content
        is kept and repeated calls add further lines.
    span : datetime.timedelta
        Size of each query window and of the step taken between windows.
    clean : bool
        Forwarded to ``get_posts``.
    start : datetime.datetime, optional
        Upper bound of the first window; defaults to the current time.
    sleep : float
        Seconds to wait before each request (0 disables the pause).

    Returns
    -------
    None

    Notes
    -----
    The loop only ends once enough posts have been gathered, so it keeps
    requesting older windows for as long as ``get_posts`` returns nothing.
    """
    if start is None:
        start = datetime.datetime.now()

    posts_so_far = 0
    with tqdm(total=n_posts) as pbar:
        while posts_so_far < n_posts:
            if sleep:
                time.sleep(sleep)

            # Pass ``span`` through so the fetched window has the same length
            # as the step below; otherwise get_posts falls back to its own
            # default and windows either leave gaps or overlap.
            posts = get_posts(start, span=span, clean=clean)
            posts_so_far += len(posts)

            start = start - span
            pbar.update(len(posts))

            # Written per window so finished windows are already on disk if a
            # later request fails.
            with open(filename, 'a') as f:
                f.writelines(json.dumps(post) + '\n' for post in posts)
        

In [78]:
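# Larger run (10,000-post target, two-week windows, 1 s pause between requests).
# Left commented out; the progress line below is from an earlier execution.
#corpus = create_corpus(10000, span=datetime.timedelta(weeks=2), sleep=1)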

10006it [09:09, 18.21it/s]                                                


In [84]:
# Run with all defaults: gather at least 100 posts in one-week windows starting
# from now, appending them to 'lw_corpus.jsonl'.
create_corpus_jsonl()

163it [00:06, 26.68it/s]                                                  
