In [77]:
import datetime
import json
import lxml.html
from lxml.html.clean import Cleaner
import requests
import time
from  tqdm import tqdm

# GraphQL endpoint that get_posts sends its queries to.
LW_API_URL = 'https://www.lesswrong.com/graphql'

# Shared lxml Cleaner used by clean_html. The allow-list holds only the empty
# string and <style>/<script> are listed in kill_tags; the sanity-check outputs
# stored in this notebook show the effect: tags are stripped, their text is
# kept, and <style> content is removed entirely.
cleaner = Cleaner(allow_tags=[''], kill_tags=['style', 'script'])
def clean_html(text):
    """Parse an HTML fragment, run it through the shared ``cleaner`` and
    return the ``.text`` of the cleaned root element.

    Args:
        text: HTML markup accepted by ``lxml.html.fromstring``.

    Returns:
        The ``.text`` attribute of the element produced by
        ``cleaner.clean_html``.
    """
    parsed = lxml.html.fromstring(text)
    stripped = cleaner.clean_html(parsed)
    return stripped.text
    
def get_posts(before, span = datetime.timedelta(weeks=1), clean=True, timeout=60):
    """Query the LessWrong GraphQL API for posts in the window ending at `before`.

    The window starts at ``before - span``. Both bounds are sent as
    ``YYYY-MM-DD`` strings, so any time-of-day component is dropped.

    Args:
        before: datetime marking the end of the window.
        span: timedelta giving the length of the window (default one week).
        clean: forwarded to convert_post; when true the post HTML is cleaned.
        timeout: seconds to wait for the HTTP request before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        A list of post dicts as produced by convert_post. An empty list is
        returned when the response is not JSON or lacks the expected
        ``data.posts.results`` structure.

    Raises:
        requests.RequestException: on connection failures or timeouts.
    """
    after = before - span

    before_s = before.strftime('%Y-%m-%d')
    after_s = after.strftime('%Y-%m-%d')

    posts_query = """
    {
      posts(input: {
        terms: {
          view: "new"
          meta: null  
          before: "%s"
          after: "%s"
        }
      }) {
        results {
          htmlBody
          title
          baseScore
          af
          _id
          userId
        }
      }
    }
    """ % (before_s, after_s)

    response = requests.post(LW_API_URL, json={'query': posts_query}, timeout=timeout)

    # Only the failures a malformed payload can cause are treated as "no posts":
    # ValueError for a non-JSON body, KeyError for a missing key, TypeError for a
    # null/non-dict level (e.g. "data": null on a GraphQL error). A bare except
    # here would also swallow KeyboardInterrupt during a long crawl.
    try:
        response_data = response.json()['data']['posts']['results']
    except (ValueError, KeyError, TypeError):
        return []

    # A null result list is treated the same as an empty one.
    return [convert_post(post, clean=clean) for post in response_data or []]

def convert_post(post, clean=True):
    """Rename the fields of a raw API post dict to the corpus field names.

    Args:
        post: dict containing the keys ``htmlBody``, ``title``, ``baseScore``,
            ``af``, ``_id`` and ``userId``; any other keys are ignored.
        clean: when true, the ``text`` field is passed through clean_html.

    Returns:
        A new dict with the keys ``text``, ``title``, ``karma``, ``af``, ``id``
        and ``userid``. Cleaning is best-effort: if the body is empty/None or
        clean_html fails, ``text`` keeps its original value.

    Raises:
        KeyError: if `post` lacks one of the expected keys.
    """
    key_mapping = {
        'htmlBody': 'text',
        'title': 'title',
        'baseScore': 'karma',
        'af': 'af',
        '_id': 'id',
        'userId': 'userid'
    }

    converted = {v: post[k] for k, v in key_mapping.items()}

    # Nothing to clean for a missing or empty body; skip instead of relying on
    # the parser raising and the error being swallowed.
    if clean and converted['text']:
        try:
            converted['text'] = clean_html(converted['text'])
        except Exception:
            # Deliberate best-effort: keep the raw body when it cannot be
            # parsed/cleaned. Unlike a bare except, this lets KeyboardInterrupt
            # and SystemExit propagate.
            pass

    return converted

def create_corpus(n_posts=100, filename='lw_corpus.json', span=datetime.timedelta(weeks=1), clean=True, start=None, sleep=0):
    """Collect posts by walking backwards in time and save them as JSON.

    Starting at `start`, consecutive windows of length `span` are fetched with
    get_posts until at least `n_posts` posts have been gathered. The final
    window is kept whole, so the result may hold slightly more than `n_posts`.

    Args:
        n_posts: minimum number of posts to collect.
        filename: path of the JSON file the collected posts are written to.
        span: timedelta length of each fetched window, and the step between
            successive windows.
        clean: forwarded to get_posts; when true the post HTML is cleaned.
        start: datetime at which the first window ends; defaults to now.
        sleep: seconds to pause before each request (0 disables the pause).

    Returns:
        The list of collected post dicts (the same data written to `filename`).
    """
    window_end = start if start else datetime.datetime.now()

    posts = []

    # NOTE: the loop only ends once n_posts have been gathered; windows that
    # yield no posts simply move the cursor further back.
    with tqdm(total=n_posts) as pbar:
        while len(posts) < n_posts:
            if sleep:
                time.sleep(sleep)
            # Forward `span` so the fetched window matches the step taken below.
            # Without it get_posts falls back to its own default window and any
            # span longer than that leaves gaps between consecutive windows.
            new_posts = get_posts(window_end, span=span, clean=clean)
            posts += new_posts
            window_end = window_end - span
            pbar.update(len(new_posts))

    with open(filename, 'w') as f:
        json.dump(posts, f)

    return posts

In [78]:
# Build a corpus of at least 10,000 posts, stepping two weeks further back per
# request and pausing one second before each request. The result is also
# written to create_corpus's default output file.
corpus = create_corpus(10000, span=datetime.timedelta(weeks=2), sleep=1)

10006it [09:09, 18.21it/s]                                                


In [76]:
# Fetch the posts for the default one-week window ending now.
posts = get_posts(datetime.datetime.now())

In [66]:
# Scratch arithmetic, equivalent to 57 * 26**3; its purpose is not evident
# from the visible cells.
57 / (1/26)**3

1001831.9999999999

In [61]:
# Display the fetched posts.
# NOTE(review): the stored output shows text spaced out character by character,
# presumably from a different revision of the cleaning code — re-run to refresh.
posts

[{'text': 'p   >   T   h   i   s       i   s       a       t   r   a   n   s   c   r   i   p   t       o   f       a       c   o   n   v   e   r   s   a   t   i   o   n       b   e   t   w   e   e   n       P   a   u   l       C   h   r   i   s   t   i   a   n   o       a   n   d       E   l   i   e   z   e   r       Y   u   d   k   o   w   s   k   y   ,       w   i   t   h       c   o   m   m   e   n   t   s       b   y       R   o   h   i   n       S   h   a   h   ,       B   e   t   h       B   a   r   n   e   s   ,       R   i   c   h   a   r   d       N   g   o   ,       a   n   d       H   o   l   d   e   n       K   a   r   n   o   f   s   k   y   ,       c   o   n   t   i   n   u   i   n   g       t   h   e       ',
  'title': 'Christiano and Yudkowsky on AI predictions and human intelligence',
  'karma': 45,
  'af': True,
  'id': 'NbGmfxbaABPsspib7',
  'userid': 'nmk3nLpQE89dMRzzN'},
 {'text': '"If this is a trap then They are coming," said Luna. She used the word "They" to av

In [27]:
# NOTE(review): `response` is not assigned at top level in any visible cell
# (get_posts only binds it locally), so this relies on kernel state left over
# from a cell that is not shown.
data = response.json()

In [28]:
# Number of posts returned by that single query.
# NOTE(review): the stored value of 50 may be a server-side page limit — TODO confirm.
len(data['data']['posts']['results'])

50

In [30]:
# Keep the first raw post dict of the response as a sample to experiment with.
post = data['data']['posts']['results'][0]

In [50]:
# Convert the sample post with the default clean=True.
# NOTE(review): the stored output predates the current cell numbering and
# presumably reflects a different revision of the cleaning code — re-run to refresh.
convert_post(post)

{'text': 'p   >   T   h   i   s       i   s       a       t   r   a   n   s   c   r   i   p   t       o   f       a       c   o   n   v   e   r   s   a   t   i   o   n       b   e   t   w   e   e   n       P   a   u   l       C   h   r   i   s   t   i   a   n   o       a   n   d       E   l   i   e   z   e   r       Y   u   d   k   o   w   s   k   y   ,       w   i   t   h       c   o   m   m   e   n   t   s       b   y       R   o   h   i   n       S   h   a   h   ,       B   e   t   h       B   a   r   n   e   s   ,       R   i   c   h   a   r   d       N   g   o   ,       a   n   d       H   o   l   d   e   n       K   a   r   n   o   f   s   k   y   ,       c   o   n   t   i   n   u   i   n   g       t   h   e       ',
 'title': 'Christiano and Yudkowsky on AI predictions and human intelligence',
 'karma': 45,
 'af': True,
 'id': 'NbGmfxbaABPsspib7',
 'userid': 'nmk3nLpQE89dMRzzN'}

In [37]:
# Run the cleaning steps by hand on the sample post's HTML so the intermediate
# lxml element can be inspected.
text = post['htmlBody']
e = cleaner.clean_html(lxml.html.fromstring(text))

In [43]:
# Text of the cleaned root element; the stored output for this cell is an
# IndexError — re-run on a fresh kernel to confirm whether it still occurs.
e.text

IndexError: list index out of range

In [23]:
# Clean the raw response body instead of a post's HTML. The stored output
# suggests this `response` held an HTML page (GraphiQL) rather than JSON.
clean_html(response.content)

'\n\n  \n  GraphiQL\n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n\n\n  \n\n'

In [22]:
# HTTP status of the scratch `response` (relies on kernel state not shown here).
response.status_code

200

In [17]:
# Inspect the prepared request that produced the scratch `response`.
response.request

<PreparedRequest [POST]>

In [42]:
# Sanity check: the <style> content is dropped and the <p> text is kept.
clean_html('<style>remove me</style><p>Hello!</p>')

'Hello!'

In [8]:
# Check the date format that get_posts uses for the query's before/after terms.
datetime.datetime.now().strftime('%Y-%m-%d')

'2022-02-24'

In [45]:
# Sanity check on a plain paragraph: the tags are stripped and the sentence is
# returned unchanged.
clean_html('<p>This is a transcript of a conversation between Paul Christiano and Eliezer Yudkowsky</p>')

'This is a transcript of a conversation between Paul Christiano and Eliezer Yudkowsky'