In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import time
from datetime import datetime
import re
import random

In [2]:
def get_boards(timeout=30) :
    """Print and return the board links found on the forum index page.

    Parameters
    ----------
    timeout : int or float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list of str
        The href of every class-bearing anchor whose href contains "board",
        in document order.
    """
    URL = "https://www.theflatearthsociety.org/forum/index.php"
    page = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    boards = []
    # class_=True keeps only the anchors that carry a class attribute.
    for e in soup.find_all("a", class_= True):
        href = e.get('href')
        # An anchor without href yields None, which would break the `in` test.
        if href and "board" in href:
            print(e.text + ' ' + href)
            boards.append(href)

    # The list used to be built and then discarded; hand it to the caller.
    return boards

In [3]:
def get_dataframe():
    """Return an empty DataFrame carrying the six message columns."""
    column_names = [
        'topic_id',
        'msg_id',
        'user_id',
        'msg_date',
        'msg_title',
        'msg_text',
    ]
    return pd.DataFrame(columns=column_names)


In [12]:
def _parse_message_row(row):
    """Extract user id, message id, timestamp and quote-free text from one post element."""
    user_profile = row.find("h4").find("a").get("href")
    user_id = user_profile[user_profile.rindex('=') + 1 :]

    # The timestamp is whatever follows "on: " in the post's small-text header.
    datetime_str = row.find("div", class_ = "body_content").find("div", class_="smalltext").text
    datetime_str = datetime_str[datetime_str.rindex("on: ") + len("on: ") : ]
    datetime_str = datetime_str.replace(" »", "")
    datetime_obj = datetime.strptime(datetime_str, '%B %d, %Y, %I:%M:%S %p')

    message = row.find("div", class_ = "inner")
    message_id = message.get("id").replace("msg_", "")

    text = message.get_text(separator = '\n', strip = True)

    # Remove quoted material by deleting each quote element's text from the post text.
    for quote_class in ('bbc_standard_quote', 'topslice_quote', 'bbc_alternate_quote'):
        for q in row.find_all(class_ = quote_class):
            text = text.replace(q.get_text(separator = '\n', strip = True), '')

    # Collapse the remaining line breaks into single spaces.
    text = re.sub(r'\n+', '\n', text).strip('\n')
    text = text.replace("\n", " ")

    return {'user_id': user_id, 'msg_id': message_id, 'msg_date': datetime_obj, 'msg_text': text}


def get_messages(url, timeout=30) :
    """Scrape the posts of one topic page into a DataFrame.

    Parameters
    ----------
    url : str
        Topic page URL containing "topic=<id>.<offset>".
    timeout : int or float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per post whose cleaned text is longer than 20 characters, with
        columns topic_id, user_id, msg_id, msg_date, msg_title, msg_text (the
        order this function has always emitted, kept because the CSV output is
        written without a header). The empty frame from get_dataframe() is
        returned when nothing could be collected.

    Notes
    -----
    Never raises for scraping problems: a URL without a topic id is reported
    and skipped; a post that cannot be parsed is reported and skipped; any
    other failure is reported, followed by a 10 second pause, and the rows
    gathered so far are returned.
    """
    print(url)

    # Validate before any network work so that a blank or foreign URL (for
    # example an empty line in a cached page list) cannot abort the whole run.
    marker = 'topic='
    if not isinstance(url, str) or marker not in url or '.' not in url:
        print("error - " + str(url))
        return get_dataframe()

    topic_id = url[url.rindex(marker) + len(marker) : url.rindex('.')]

    # Rows are gathered as plain dicts and turned into a frame once at the end.
    records = []

    try:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser", from_encoding="ISO-8859-1")

        msg_title = soup.find("h1").text.replace("\n","").replace("\t","")

        for row in soup.find_all("div", class_ = "body_message"):
            try:
                parsed = _parse_message_row(row)
            except (AttributeError, TypeError, ValueError) as exc:
                # A post with missing or unexpected markup must not discard
                # the remaining posts on the page.
                print("error - " + str(url) + " (post skipped: " + repr(exc) + ")")
                continue

            if len(parsed['msg_text']) > 20 :
                records.append({
                    'topic_id' : topic_id,
                    'user_id' : parsed['user_id'],
                    'msg_id' : parsed['msg_id'],
                    'msg_date' : parsed['msg_date'],
                    'msg_title' : msg_title,
                    'msg_text' : parsed['msg_text'],
                })

    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops a long scrape.
        print("error - " + str(url) + " (" + repr(exc) + ")")
        time.sleep(10)

    if not records:
        return get_dataframe()

    return pd.DataFrame(records)
    

In [5]:
def _page_offset(href):
    """Return the integer that follows the last '.' in a link such as '...topic=12.30'."""
    return int(href[href.rindex('.') + 1 : ])


def get_page_topics(url, timeout=30) :
    """Collect the URLs of every page of every topic listed on one board page.

    Parameters
    ----------
    url : str
        URL of a board listing page.
    timeout : int or float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list of str
        For each subject cell, the topic's first-page link followed by the
        links of its further pages, generated from the offset step between
        the first two page links. Empty or partial when the page could not be
        fetched or parsed (the error is printed, followed by a 10 second
        pause).
    """
    topics = []

    try:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")

        for cell in soup.find_all("td", class_= "subject"):
            # Reset per cell: previously a cell without a topic link reused
            # the previous cell's links (or hit a NameError on the first one).
            start_page = None
            for anchor in cell.find_all("a"):
                href = anchor.get('href')
                # The last matching anchor wins.
                if href and "topic" in href and href.endswith(".0") :
                    start_page = href

            if start_page is None:
                continue

            next_page = start_page
            last_page = start_page

            nav_links = cell.find_all("a", class_ = "navPages")
            if len(nav_links) >= 2 :
                next_page = nav_links[1].get('href')
                last_page = nav_links[-1].get('href')

            try:
                step = _page_offset(next_page) - _page_offset(start_page)
                # Integer division; offsets are whole multiples of the step.
                num_pages = _page_offset(last_page) // step if step > 0 else 0
            except (AttributeError, TypeError, ValueError):
                # Unreadable pagination links: keep at least the first page.
                print("error - " + str(start_page))
                step = 0
                num_pages = 0

            base_page = start_page[ : start_page.rindex('.') + 1]
            topics.extend(base_page + str(i * step) for i in range(num_pages + 1))

    except Exception as exc:
        # `Exception` rather than a bare except, so a kernel interrupt still
        # stops the scrape.
        print("error - " + str(url) + " (" + repr(exc) + ")")
        time.sleep(10)

    return topics


In [6]:
def get_board_pages(url, timeout=30) :
    """Return the URLs of all listing pages of a board.

    Parameters
    ----------
    url : str
        URL of the board's first listing page.
    timeout : int or float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list of str
        Page URLs generated from the offset step between the first two
        pagination links, up to the offset of the last pagination link. When
        the pagination block holds fewer than two usable links, or the step
        is not positive, the result is `url` plus whatever page links were
        found. Empty when the page could not be fetched or has no pagination
        block (the error is printed, followed by a 10 second pause).
    """

    def offset_of(href):
        # Integer after the last '.' of a link such as '...board=3.50'.
        return int(href[href.rindex('.') + 1 : ])

    board_pages = []

    try :
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        links = soup.find_all("div", class_= "pagelinks floatleft")
        nav_hrefs = [a.get('href') for a in links[0].find_all("a", class_ = "navPages")]
        nav_hrefs = [h for h in nav_hrefs if h]

        step = 0
        if len(nav_hrefs) >= 2 :
            start_page = nav_hrefs[0]
            next_page = nav_hrefs[1]
            # Use the final link as the last page instead of assuming it sits
            # at index 2, matching how topic pagination is read elsewhere.
            last_page = nav_hrefs[-1]
            step = offset_of(next_page) - offset_of(start_page)

        if step > 0 :
            num_pages = offset_of(last_page) // step
            base_page = start_page[ : start_page.rindex('.') + 1]
            board_pages = [base_page + str(i * step) for i in range(num_pages + 1)]
        else :
            # Too little pagination to derive a step (short board): fall back
            # to the requested page plus the page links that do exist,
            # de-duplicated in order.
            board_pages = list(dict.fromkeys([url] + nav_hrefs))

    except Exception as exc:
        # `Exception` rather than a bare except, so a kernel interrupt still
        # stops the scrape.
        print("error - " + str(url) + " (" + repr(exc) + ")")
        time.sleep(10)

    return board_pages
        

In [7]:
def get_pages_board_topics(board):
    """Return the topic-page URLs of every listing page of `board`, flattened into one list."""
    return [
        topic_page
        for listing_page in get_board_pages(board)
        for topic_page in get_page_topics(listing_page)
    ]

In [14]:
def scrape_pages(pages, csv_file):
    """Scrape every topic page, appending its rows to `csv_file` as it goes.

    Parameters
    ----------
    pages : iterable of str
        Topic page URLs passed one by one to get_messages().
    csv_file : str
        Output path. The file is created/truncated first and then appended to
        after each page, without header or index.

    Returns
    -------
    pandas.DataFrame
        All scraped rows concatenated, or the empty frame from
        get_dataframe() when no page produced any row.
    """
    # Writing the empty frame creates/truncates the file before appending.
    get_dataframe().to_csv(csv_file, index=False, header=False)

    # Collect per-page frames and concatenate once, instead of re-copying the
    # growing result on every iteration.
    frames = []
    for page in pages:
        df = get_messages(page)
        df.to_csv(csv_file, mode='a', index=False, header=False)
        if not df.empty:
            frames.append(df)

    if not frames:
        return get_dataframe()

    return pd.concat(frames)

In [17]:
def scrape_board(name, url, sample = 100.0) :
    """Scrape one board into 'flat-earth-data/<name>.csv' and return its messages.

    The list of topic pages is cached in 'flat-earth-data/Pages <name>.txt'
    and reused on later calls.

    Parameters
    ----------
    name : str
        Board name, used to build the cache and CSV file names.
    url : str
        URL of the board's first listing page; only used when no usable
        cached page list exists.
    sample : float, optional
        Percentage of topic pages to scrape. Values below 100 select
        int(len(pages) * sample / 100) distinct pages at random.

    Returns
    -------
    pandas.DataFrame
        The frame returned by scrape_pages() for the selected pages.
    """
    data_dir = 'flat-earth-data/'
    txt_file = data_dir + 'Pages ' + name + '.txt'
    csv_file = data_dir + name + '.csv'

    # Both files live in data_dir; create it so the first run does not fail.
    os.makedirs(data_dir, exist_ok=True)

    board_pages = []
    if os.path.exists(txt_file):
        # load from file, ignoring blank lines (an empty cache used to load
        # as [''] and crash the message scraper)
        with open(txt_file, 'r') as fp:
            board_pages = [line for line in fp.read().splitlines() if line.strip()]

    if not board_pages:
        # scrape for pages; only cache a non-empty result so that a failed
        # scrape is retried on the next call
        board_pages = get_pages_board_topics(url)
        if board_pages:
            with open(txt_file, 'w') as fp:
                fp.write('\n'.join(board_pages))

    if sample < 100 :
        sample_size = max(0, int((len(board_pages) * sample) / 100))
        # random.sample draws without replacement; random.choices could pick
        # the same page several times and duplicate its messages.
        board_pages = random.sample(board_pages, sample_size)

    msg_df = scrape_pages(board_pages, csv_file)

    return msg_df


In [None]:
# List the forum's boards: get_boards() prints each board link's text and href.
get_boards()

In [None]:
# Test: trial run on a single board. `sample` is a percentage, so 0.1 scrapes
# 0.1 % of the board's topic pages.
msg_df = scrape_board('Technology, Science & Alt Science', 'https://www.theflatearthsociety.org/forum/index.php?board=3.0', 0.1)

In [None]:
# Scrape six boards with the same sampling percentage (0.1 means 0.1 % of each
# board's topic pages). Each call writes 'flat-earth-data/<board name>.csv',
# overwriting any CSV an earlier run produced for the same board name.
sample = 0.1
msg_df_gn = scrape_board('Flat Earth General', 'https://www.theflatearthsociety.org/forum/index.php?board=20.0', sample)
msg_df_qa = scrape_board('Flat Earth Q&A', 'https://www.theflatearthsociety.org/forum/index.php?board=12.0', sample)
msg_df_db = scrape_board('Flat Earth Debate', 'https://www.theflatearthsociety.org/forum/index.php?board=10.0', sample)
msg_df_bl = scrape_board('Flat Earth Believers', 'https://www.theflatearthsociety.org/forum/index.php?board=8.0', sample)
msg_df_pr = scrape_board('Philosophy, Religion & Society', 'https://www.theflatearthsociety.org/forum/index.php?board=11.0', sample)
msg_df_ts = scrape_board('Technology, Science & Alt Science', 'https://www.theflatearthsociety.org/forum/index.php?board=3.0', sample)
