In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import time
from datetime import datetime
import re
import random

In [2]:
def get_boards() :
    URL = "https://www.theflatearthsociety.org/forum/index.php"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")

    boards = []
    elements = soup.find_all("a", class_= True)
    for e in elements:
        href=e.get('href')
        if "board" in href:
            print(e.text + ' ' + href)
            boards.append(href)

In [3]:
def get_dataframe():
    
    return pd.DataFrame(columns = ['topic_id','msg_id','user_id','msg_date', 'msg_title', 'msg_text'])


In [12]:
def get_messages(url) :
    
    print(url)
    msgs_df = get_dataframe()
    
    topic_id = url[url.rindex('topic=') + len('topic=') : url.rindex('.')]
    
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser", from_encoding="ISO-8859-1")    

        msg_title = soup.find("h1").text.replace("\n","").replace("\t","")

        rows = soup.find_all("div", class_ = "body_message")

        for row in rows:

                user_info = row.find("h4")
                user_name = user_info.text.replace("\n","")
                user_profile = user_info.find("a").get("href")
                user_id = user_profile[user_profile.rindex('=') + 1 :]

                datetime_str = row.find("div", class_ = "body_content").find("div", class_="smalltext").text
                datetime_str = datetime_str[datetime_str.rindex("on: ") + len("on: ") : ]
                datetime_str = datetime_str.replace(" »", "")
                datetime_obj = datetime.strptime(datetime_str, '%B %d, %Y, %I:%M:%S %p')        

                message = row.find("div", class_ = "inner")
                message_id = message.get("id").replace("msg_", "")

                text = message.get_text(separator = '\n', strip = True)

                quotes = row.find_all(class_ = 'bbc_standard_quote')
                for q in quotes :
                    text = text.replace(q.get_text(separator = '\n', strip = True), '')

                quotes = row.find_all(class_ = 'topslice_quote')
                for q in quotes :
                    text = text.replace(q.get_text(separator = '\n', strip = True), '')

                quotes = row.find_all(class_ = 'bbc_alternate_quote')
                for q in quotes :
                    text = text.replace(q.get_text(separator = '\n', strip = True), '')    

                text = re.sub(r'\n+', '\n', text).strip('\n')
                text = text.replace("\n", " ")

                if len(text) > 20 :
                    msg_df = pd.DataFrame({'topic_id' : topic_id, 'user_id' : user_id, 'msg_id': message_id, 'msg_date': datetime_obj, 'msg_title' : msg_title, 'msg_text' : text }, index=[0])
                    
                    msgs_df = pd.concat([msgs_df if not msgs_df.empty else None, msg_df])
                
    except:
        print("error - " + str(url))
        time.sleep(10)
    
    return msgs_df
    

In [5]:
def get_page_topics(url) :
    
    topics = []
    
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        links = soup.find_all("td", class_= "subject")
        for link in links:
            pages = link.find_all("a")
            for p in pages : 
                href = p.get('href')
                if "topic" in href and href.endswith(".0") :
                    start_page = href
                    next_page = href
                    last_page = href

            pages = link.find_all("a", class_ = "navPages")

            if len(pages) == 2 :    
                next_page = pages[1].get('href')
                last_page = next_page
            elif len(pages) > 2 :
                next_page = pages[1].get('href')
                last_page = pages[len(pages) - 1].get('href')

            step = int(next_page[next_page.rindex('.') + 1 : ]) - int(start_page[start_page.rindex('.') + 1 : ])
            if step > 0 :
                num_pages = int(int(last_page[last_page.rindex('.') + 1 : ]) / step)
            else :
                num_pages = 0
            base_page = start_page[ : start_page.rindex('.') + 1]
            for i in range(num_pages + 1):
                topic = base_page + str(i * step)
                topics.append(topic)
                
    except :
        print("error - " + url)
        time.sleep(10)
        
    return topics


In [6]:
def get_board_pages(url) :
    
    board_pages = []
    
    try :
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        links = soup.find_all("div", class_= "pagelinks floatleft")
        pages = links[0].find_all("a", class_ = "navPages")
        start_page = pages[0].get('href')
        next_page = pages[1].get('href')
        last_page = pages[2].get('href')
        step = int(next_page[next_page.rindex('.') + 1 : ]) - int(start_page[start_page.rindex('.') + 1 : ])
        num_pages = int(int(last_page[last_page.rindex('.') + 1 : ]) / step)
        base_page = start_page[ : start_page.rindex('.') + 1]
        for i in range(num_pages + 1):
            board_page = base_page + str(i * step)
            board_pages.append(board_page)
    
    except :
        print("error - " + url)
        time.sleep(10)
        
    return board_pages
        

In [7]:
def get_pages_board_topics(board):
    
    board_pages = get_board_pages(board)
    board_topic_pages = []
    
    for page in board_pages:
        page_topics = get_page_topics(page)
        board_topic_pages.extend(page_topics)
    
    return board_topic_pages

In [14]:
def scrape_pages(pages, csv_file):
    
    msgs_df = get_dataframe()
    msgs_df.to_csv(csv_file, index=False, header=False)
    
    for page in pages:
        df = get_messages(page)
        df.to_csv(csv_file, mode='a', index=False, header=False)      
        msgs_df = pd.concat([msgs_df if not msgs_df.empty else None, df])
    
    return msgs_df

In [17]:
def scrape_board(name, url, sample = 100.0) :
    
    txt_file = 'flat-earth-data/' + 'Pages ' + name + '.txt'
    csv_file = 'flat-earth-data/' + name + '.csv'
        
    if os.path.exists(txt_file):
        # load from file
        with open(txt_file, 'r') as fp:
            board_pages = fp.read().split("\n")
            
    else:
        # scrape for pages
        board_pages = get_pages_board_topics(url)    
        with open(txt_file, 'w') as fp:
            fp.write('\n'.join(board_pages))

    if sample < 100 :
        board_pages = random.choices(board_pages, k = int((len(board_pages) * sample) / 100))

    msg_df = scrape_pages(board_pages, csv_file)
    
    return msg_df


In [10]:
get_boards()

Announcements https://www.theflatearthsociety.org/forum/index.php?board=15.0
Suggestions & Concerns https://www.theflatearthsociety.org/forum/index.php?board=18.0
Flat Earth General https://www.theflatearthsociety.org/forum/index.php?board=20.0
Flat Earth Q&A https://www.theflatearthsociety.org/forum/index.php?board=12.0
Flat Earth Debate https://www.theflatearthsociety.org/forum/index.php?board=10.0
Flat Earth Believers https://www.theflatearthsociety.org/forum/index.php?board=8.0
Flat Earth Information Repository https://www.theflatearthsociety.org/forum/index.php?board=2.0
The Lounge https://www.theflatearthsociety.org/forum/index.php?board=7.0
Arts & Entertainment https://www.theflatearthsociety.org/forum/index.php?board=16.0
Philosophy, Religion & Society https://www.theflatearthsociety.org/forum/index.php?board=11.0
Technology, Science & Alt Science https://www.theflatearthsociety.org/forum/index.php?board=3.0


In [15]:
#Test
msg_df = scrape_board('Technology, Science & Alt Science', 'https://www.theflatearthsociety.org/forum/index.php?board=3.0', 0.1)

https://www.theflatearthsociety.org/forum/index.php?topic=83616.0
https://www.theflatearthsociety.org/forum/index.php?topic=62555.9960
https://www.theflatearthsociety.org/forum/index.php?topic=55422.90
https://www.theflatearthsociety.org/forum/index.php?topic=1282.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=1282.0
https://www.theflatearthsociety.org/forum/index.php?topic=11293.4110


In [18]:
sample = 0.1
msg_df_gn = scrape_board('Flat Earth General', 'https://www.theflatearthsociety.org/forum/index.php?board=20.0', sample)
msg_df_qa = scrape_board('Flat Earth Q&A', 'https://www.theflatearthsociety.org/forum/index.php?board=12.0', sample)
msg_df_db = scrape_board('Flat Earth Debate', 'https://www.theflatearthsociety.org/forum/index.php?board=10.0', sample)
msg_df_bl = scrape_board('Flat Earth Believers', 'https://www.theflatearthsociety.org/forum/index.php?board=8.0', sample)
msg_df_pr = scrape_board('Philosophy, Religion & Society', 'https://www.theflatearthsociety.org/forum/index.php?board=11.0', sample)
msg_df_ts = scrape_board('Technology, Science & Alt Science', 'https://www.theflatearthsociety.org/forum/index.php?board=3.0', sample)


https://www.theflatearthsociety.org/forum/index.php?topic=69928.0
https://www.theflatearthsociety.org/forum/index.php?topic=39656.0
https://www.theflatearthsociety.org/forum/index.php?topic=1043.0
https://www.theflatearthsociety.org/forum/index.php?topic=22053.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=22053.0


  msgs_df = pd.concat([msgs_df if not msgs_df.empty else None, df])


https://www.theflatearthsociety.org/forum/index.php?topic=11424.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=11424.0
https://www.theflatearthsociety.org/forum/index.php?topic=54320.60
https://www.theflatearthsociety.org/forum/index.php?topic=18477.0
https://www.theflatearthsociety.org/forum/index.php?topic=11631.30
https://www.theflatearthsociety.org/forum/index.php?topic=17207.0
https://www.theflatearthsociety.org/forum/index.php?topic=29381.0
https://www.theflatearthsociety.org/forum/index.php?topic=52058.0
https://www.theflatearthsociety.org/forum/index.php?topic=33566.0
https://www.theflatearthsociety.org/forum/index.php?topic=15877.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=15877.0
https://www.theflatearthsociety.org/forum/index.php?topic=61754.30
https://www.theflatearthsociety.org/forum/index.php?topic=31314.0
https://www.theflatearthsociety.org/forum/index.php?topic=43189.0
error - https://www.theflatearthsociety.org/forum/index.p

  msgs_df = pd.concat([msgs_df if not msgs_df.empty else None, df])


https://www.theflatearthsociety.org/forum/index.php?topic=70968.420
https://www.theflatearthsociety.org/forum/index.php?topic=63428.0
https://www.theflatearthsociety.org/forum/index.php?topic=85671.90
https://www.theflatearthsociety.org/forum/index.php?topic=61352.0
https://www.theflatearthsociety.org/forum/index.php?topic=21878.0
https://www.theflatearthsociety.org/forum/index.php?topic=61836.330
https://www.theflatearthsociety.org/forum/index.php?topic=49250.0
https://www.theflatearthsociety.org/forum/index.php?topic=17872.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=17872.0


  msgs_df = pd.concat([msgs_df if not msgs_df.empty else None, df])


https://www.theflatearthsociety.org/forum/index.php?topic=74750.0
https://www.theflatearthsociety.org/forum/index.php?topic=44128.90
https://www.theflatearthsociety.org/forum/index.php?topic=51323.60
https://www.theflatearthsociety.org/forum/index.php?topic=61044.90
https://www.theflatearthsociety.org/forum/index.php?topic=24959.30
https://www.theflatearthsociety.org/forum/index.php?topic=21018.0
https://www.theflatearthsociety.org/forum/index.php?topic=67119.120
https://www.theflatearthsociety.org/forum/index.php?topic=64014.1110
https://www.theflatearthsociety.org/forum/index.php?topic=45647.60
https://www.theflatearthsociety.org/forum/index.php?topic=36793.0
https://www.theflatearthsociety.org/forum/index.php?topic=57067.30
https://www.theflatearthsociety.org/forum/index.php?topic=36696.0
https://www.theflatearthsociety.org/forum/index.php?topic=63643.0
https://www.theflatearthsociety.org/forum/index.php?topic=30827.120
https://www.theflatearthsociety.org/forum/index.php?topic=67313

https://www.theflatearthsociety.org/forum/index.php?topic=66683.0
https://www.theflatearthsociety.org/forum/index.php?topic=33115.0
https://www.theflatearthsociety.org/forum/index.php?topic=78419.30
https://www.theflatearthsociety.org/forum/index.php?topic=68822.0
https://www.theflatearthsociety.org/forum/index.php?topic=50313.0
https://www.theflatearthsociety.org/forum/index.php?topic=70200.120
https://www.theflatearthsociety.org/forum/index.php?topic=61743.0
https://www.theflatearthsociety.org/forum/index.php?topic=77905.0
https://www.theflatearthsociety.org/forum/index.php?topic=73003.60
https://www.theflatearthsociety.org/forum/index.php?topic=68163.0
https://www.theflatearthsociety.org/forum/index.php?topic=62278.30
https://www.theflatearthsociety.org/forum/index.php?topic=67362.0
https://www.theflatearthsociety.org/forum/index.php?topic=62494.210
https://www.theflatearthsociety.org/forum/index.php?topic=74672.0
https://www.theflatearthsociety.org/forum/index.php?topic=48676.90
ht

error - https://www.theflatearthsociety.org/forum/index.php?topic=49299.60
https://www.theflatearthsociety.org/forum/index.php?topic=71164.30
https://www.theflatearthsociety.org/forum/index.php?topic=60179.0
https://www.theflatearthsociety.org/forum/index.php?topic=66364.0
https://www.theflatearthsociety.org/forum/index.php?topic=87272.90
https://www.theflatearthsociety.org/forum/index.php?topic=91521.0
https://www.theflatearthsociety.org/forum/index.php?topic=64824.30
https://www.theflatearthsociety.org/forum/index.php?topic=53535.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=53535.0
https://www.theflatearthsociety.org/forum/index.php?topic=35939.0
https://www.theflatearthsociety.org/forum/index.php?topic=68677.150
https://www.theflatearthsociety.org/forum/index.php?topic=64102.30
https://www.theflatearthsociety.org/forum/index.php?topic=68636.240
https://www.theflatearthsociety.org/forum/index.php?topic=82299.90
https://www.theflatearthsociety.org/forum/index.ph

https://www.theflatearthsociety.org/forum/index.php?topic=56536.0
https://www.theflatearthsociety.org/forum/index.php?topic=83445.30
https://www.theflatearthsociety.org/forum/index.php?topic=73277.0
https://www.theflatearthsociety.org/forum/index.php?topic=68281.0
https://www.theflatearthsociety.org/forum/index.php?topic=60218.0
https://www.theflatearthsociety.org/forum/index.php?topic=74422.0
https://www.theflatearthsociety.org/forum/index.php?topic=62276.0
https://www.theflatearthsociety.org/forum/index.php?topic=68654.1260
https://www.theflatearthsociety.org/forum/index.php?topic=69842.0
https://www.theflatearthsociety.org/forum/index.php?topic=37358.30
error - https://www.theflatearthsociety.org/forum/index.php?topic=37358.30
https://www.theflatearthsociety.org/forum/index.php?topic=70646.0
https://www.theflatearthsociety.org/forum/index.php?topic=66725.900
https://www.theflatearthsociety.org/forum/index.php?topic=34593.60
error - https://www.theflatearthsociety.org/forum/index.php

https://www.theflatearthsociety.org/forum/index.php?topic=65007.0
https://www.theflatearthsociety.org/forum/index.php?topic=87698.30
https://www.theflatearthsociety.org/forum/index.php?topic=64284.0
https://www.theflatearthsociety.org/forum/index.php?topic=61565.0
https://www.theflatearthsociety.org/forum/index.php?topic=55307.30
https://www.theflatearthsociety.org/forum/index.php?topic=72882.60
https://www.theflatearthsociety.org/forum/index.php?topic=63099.30
https://www.theflatearthsociety.org/forum/index.php?topic=55663.0
https://www.theflatearthsociety.org/forum/index.php?topic=34697.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=34697.0
https://www.theflatearthsociety.org/forum/index.php?topic=74767.0
https://www.theflatearthsociety.org/forum/index.php?topic=66077.0
https://www.theflatearthsociety.org/forum/index.php?topic=63886.0
https://www.theflatearthsociety.org/forum/index.php?topic=38912.90
https://www.theflatearthsociety.org/forum/index.php?topic=50823

https://www.theflatearthsociety.org/forum/index.php?topic=73889.60
https://www.theflatearthsociety.org/forum/index.php?topic=59823.30
https://www.theflatearthsociety.org/forum/index.php?topic=61880.90
https://www.theflatearthsociety.org/forum/index.php?topic=86036.870
https://www.theflatearthsociety.org/forum/index.php?topic=73832.0
https://www.theflatearthsociety.org/forum/index.php?topic=87127.600
https://www.theflatearthsociety.org/forum/index.php?topic=70710.300
https://www.theflatearthsociety.org/forum/index.php?topic=53883.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=53883.0
https://www.theflatearthsociety.org/forum/index.php?topic=68164.240
https://www.theflatearthsociety.org/forum/index.php?topic=62708.0
https://www.theflatearthsociety.org/forum/index.php?topic=64288.0
https://www.theflatearthsociety.org/forum/index.php?topic=33472.30
https://www.theflatearthsociety.org/forum/index.php?topic=77398.30
https://www.theflatearthsociety.org/forum/index.php?top

https://www.theflatearthsociety.org/forum/index.php?topic=43150.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=43150.0
https://www.theflatearthsociety.org/forum/index.php?topic=39365.0
https://www.theflatearthsociety.org/forum/index.php?topic=69381.0
https://www.theflatearthsociety.org/forum/index.php?topic=68458.0
https://www.theflatearthsociety.org/forum/index.php?topic=64156.0
https://www.theflatearthsociety.org/forum/index.php?topic=92164.30
https://www.theflatearthsociety.org/forum/index.php?topic=77228.30
https://www.theflatearthsociety.org/forum/index.php?topic=90770.150
https://www.theflatearthsociety.org/forum/index.php?topic=79944.0
https://www.theflatearthsociety.org/forum/index.php?topic=77155.0
https://www.theflatearthsociety.org/forum/index.php?topic=63361.0
https://www.theflatearthsociety.org/forum/index.php?topic=61973.0
https://www.theflatearthsociety.org/forum/index.php?topic=36406.0
https://www.theflatearthsociety.org/forum/index.php?topic=54468.

error - https://www.theflatearthsociety.org/forum/index.php?topic=34937.30
https://www.theflatearthsociety.org/forum/index.php?topic=62143.60
https://www.theflatearthsociety.org/forum/index.php?topic=86675.30
https://www.theflatearthsociety.org/forum/index.php?topic=72255.0
https://www.theflatearthsociety.org/forum/index.php?topic=77293.0
https://www.theflatearthsociety.org/forum/index.php?topic=54133.0
https://www.theflatearthsociety.org/forum/index.php?topic=73579.0
https://www.theflatearthsociety.org/forum/index.php?topic=59288.0
https://www.theflatearthsociety.org/forum/index.php?topic=51457.0
https://www.theflatearthsociety.org/forum/index.php?topic=62231.330
https://www.theflatearthsociety.org/forum/index.php?topic=48087.30
error - https://www.theflatearthsociety.org/forum/index.php?topic=48087.30
https://www.theflatearthsociety.org/forum/index.php?topic=92207.60
https://www.theflatearthsociety.org/forum/index.php?topic=68804.60
https://www.theflatearthsociety.org/forum/index.php

https://www.theflatearthsociety.org/forum/index.php?topic=32769.0
https://www.theflatearthsociety.org/forum/index.php?topic=20588.30
https://www.theflatearthsociety.org/forum/index.php?topic=5394.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=5394.0
https://www.theflatearthsociety.org/forum/index.php?topic=66639.0
https://www.theflatearthsociety.org/forum/index.php?topic=61215.390
https://www.theflatearthsociety.org/forum/index.php?topic=49769.0
https://www.theflatearthsociety.org/forum/index.php?topic=6503.0
https://www.theflatearthsociety.org/forum/index.php?topic=60166.780
https://www.theflatearthsociety.org/forum/index.php?topic=14619.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=14619.0
https://www.theflatearthsociety.org/forum/index.php?topic=8922.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=8922.0
https://www.theflatearthsociety.org/forum/index.php?topic=22057.0
error - https://www.theflatearthsociety.org/forum/in

https://www.theflatearthsociety.org/forum/index.php?topic=55155.0
https://www.theflatearthsociety.org/forum/index.php?topic=14414.30
https://www.theflatearthsociety.org/forum/index.php?topic=50488.0
error - https://www.theflatearthsociety.org/forum/index.php?topic=50488.0
https://www.theflatearthsociety.org/forum/index.php?topic=7268.0
https://www.theflatearthsociety.org/forum/index.php?topic=68330.0
https://www.theflatearthsociety.org/forum/index.php?topic=60166.30
error - https://www.theflatearthsociety.org/forum/index.php?topic=60166.30
https://www.theflatearthsociety.org/forum/index.php?topic=10839.30
https://www.theflatearthsociety.org/forum/index.php?topic=30672.30
https://www.theflatearthsociety.org/forum/index.php?topic=55575.0
https://www.theflatearthsociety.org/forum/index.php?topic=28431.120
https://www.theflatearthsociety.org/forum/index.php?topic=61037.30
https://www.theflatearthsociety.org/forum/index.php?topic=15634.0
https://www.theflatearthsociety.org/forum/index.php?t

KeyboardInterrupt: 