In [5]:
import requests
import datetime
import json
from lxml.html.clean import Cleaner
import lxml.html
from tqdm import tqdm

# GraphQL endpoint that post_request() and posts_request() POST their queries to.
LW_GRAPHQL_URL = 'https://www.lesswrong.com/graphql'

def post_request(post_id, timeout=30):
    """Fetch a single post from the GraphQL endpoint.

    Args:
        post_id: The post's ``_id``; it is interpolated into the query as a
            quoted string.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        dict with the keys ``text`` (the post's ``htmlBody``), ``title``,
        ``karma`` (``baseScore``), ``af``, ``userid`` and ``id``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        ValueError: The body is not valid JSON, or it contains no post result.
    """
    query = """
    {
      post(
        input: {
          selector: {
            _id: "%s"
          }
        }
      ) {
        result {
          htmlBody
          title
          baseScore
          af
          userId
          _id
        }
      }
    }
    """ % post_id

    response = requests.post(LW_GRAPHQL_URL, json={'query': query}, timeout=timeout)
    # Check the HTTP status before decoding: an error page is not JSON and
    # would otherwise surface as an uninformative JSONDecodeError.
    response.raise_for_status()
    payload = response.json()

    # 'data', 'post' or 'result' can each be missing/None when the query
    # fails or the id is unknown, so walk the path defensively.
    result = ((payload.get('data') or {}).get('post') or {}).get('result')
    if result is None:
        raise ValueError(
            'No post returned for id %r (GraphQL errors: %r)'
            % (post_id, payload.get('errors'))
        )

    return {
        'text': result['htmlBody'],
        'title': result['title'],
        'karma': result['baseScore'],
        'af': result['af'],
        'userid': result['userId'],
        'id': result['_id'],
    }


def posts_request(before = '2022-12-31', after = '2022-01-01', timeout=30):
    """List the ids of posts in a date window, using the "new" view.

    Args:
        before: Value sent as the ``before`` term of the query (the callers
            in this notebook pass ``YYYY-MM-DD`` strings).
        after: Value sent as the ``after`` term of the query.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        list of post ``_id`` values, in the order the server returned them
        (possibly empty).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        ValueError: The body is not valid JSON, or it contains no result list.
    """
    # NOTE(review): no `limit` term is sent, so whatever page size the server
    # applies by default decides how many ids come back per window -- confirm
    # that it does not silently truncate busy windows.
    query= """
    {
      posts(input: {
        terms: {
          view: "new"
          meta: null 
          
          before: "%s"
          after: "%s"
        }
      }) {
        results {
          _id
        }
      }
    }
    """ % (before, after)

    response = requests.post(LW_GRAPHQL_URL, json={'query': query}, timeout=timeout)
    # Check the HTTP status before decoding: an error page is not JSON and
    # would otherwise surface as an uninformative JSONDecodeError.
    response.raise_for_status()
    payload = response.json()

    results = ((payload.get('data') or {}).get('posts') or {}).get('results')
    if results is None:
        raise ValueError(
            'No post list returned for %s..%s (GraphQL errors: %r)'
            % (after, before, payload.get('errors'))
        )

    return [item['_id'] for item in results]
    
def remove_html(corpus, use_tqdm = True):
    """Return copies of the posts with the markup stripped from ``text``.

    Args:
        corpus: Iterable of post dicts, each with a ``text`` entry holding
            HTML (or ``None`` / an empty string for posts without a body).
        use_tqdm: Wrap the iteration in a ``tqdm`` progress bar.

    Returns:
        A new list of dicts; every input dict is copied with ``text``
        replaced by its plain-text content. The input dicts are not mutated.
        Posts without any text get ``''``.
    """
    cleaner = Cleaner(kill_tags=['script', 'style'], allow_tags=[''])

    def clean(text):
        # lxml.html.fromstring cannot parse None or an empty/whitespace-only
        # document; without this guard one body-less post aborts the run.
        if not text or not text.strip():
            return ''
        # .text is None when the cleaned markup holds no text at all
        # (e.g. an image-only post); normalise that to an empty string.
        return cleaner.clean_html(lxml.html.fromstring(text)).text or ''

    if use_tqdm:
        corpus = tqdm(corpus)

    return [{**post, 'text': clean(post['text'])} for post in corpus]

def get_n_post_ids(n=100, start=None, max_empty_weeks=52):
    """Collect at least ``n`` post ids by walking backwards one week at a time.

    Args:
        n: Minimum number of ids wanted. The last window is kept whole, so
            more than ``n`` ids can be returned.
        start: ``datetime`` to start from (the newest edge of the first
            window). ``None`` means "now", evaluated at call time.
        max_empty_weeks: Give up after this many consecutive windows that
            contribute no new id, so asking for more posts than exist does
            not loop forever.

    Returns:
        list of unique post ids, in the order they were first seen. It holds
        fewer than ``n`` ids only when the search gave up.
    """
    # A default of datetime.datetime.now() in the signature is evaluated once,
    # when the cell defining the function runs, and would go stale in a
    # long-lived kernel.
    if start is None:
        start = datetime.datetime.now()

    one_week = datetime.timedelta(weeks=1)
    ids = []
    seen = set()
    empty_streak = 0

    while len(ids) < n and empty_streak < max_empty_weeks:
        end = start - one_week
        batch = posts_request(before=start.strftime('%Y-%m-%d'), after=end.strftime('%Y-%m-%d'))

        # Adjacent windows share a boundary date, so the same post may come
        # back twice; keep only the first occurrence.
        added = 0
        for post_id in batch:
            if post_id not in seen:
                seen.add(post_id)
                ids.append(post_id)
                added += 1

        empty_streak = 0 if added else empty_streak + 1
        start = end

    return ids

def create_corpus(n=1000, filename='lw_corpus.json'):
    """Download posts, strip their HTML and save the result as JSON.

    Args:
        n: Minimum number of post ids to collect (passed to
            ``get_n_post_ids``).
        filename: Path of the JSON file the corpus is written to.

    Returns:
        The corpus: the list of post dicts produced by ``remove_html``.
        Posts whose download failed are skipped and therefore absent.
    """
    print('Getting post IDs...')
    ids = get_n_post_ids(n)

    print('Getting %d posts...' % len(ids))
    raw_corpus = []
    failures = []
    for id_ in tqdm(ids):
        try:
            post = post_request(id_)
        except Exception as exc:
            # Best-effort download: one broken post must not abort the run.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit stop the loop as the user expects.
            failures.append((id_, exc))
            continue
        if post:
            raw_corpus.append(post)

    if failures:
        print('Skipped %d posts that could not be fetched (first error: %r)'
              % (len(failures), failures[0][1]))

    print('Removing HTML tags...')
    corpus = remove_html(raw_corpus)

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(corpus, f)

    print('Corpus saved as %s' % filename)
    return corpus


In [6]:
# Build a corpus from at least 10000 post ids; it is also saved to the default
# 'lw_corpus.json'. NOTE: the recorded run below stopped with a JSONDecodeError
# right after "Getting post IDs...", i.e. while the ids were being collected.
c = create_corpus(10000)

Getting post IDs...


JSONDecodeError: Expecting value: line 1 column 1 (char 0)