In [9]:
"""
A minimal Bitcointalk recent posts scraper.
It scrapes the specified number of posts from:
https://bitcointalk.org/index.php?action=recent
and returns a pandas DataFrame with columns:
time, post, section, subsection, topic.
"""

import csv
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Listing URL of the forum's "recent posts" view; ``start`` is the post offset.
_RECENT_POSTS_URL = "https://bitcointalk.org/index.php?action=recent;start={offset}"
_POSTS_PER_PAGE = 10   # offset step between consecutive listing pages
_PAGE_LOAD_WAIT_S = 3  # fixed pause after each navigation so the page can load

# (column name, CSS selector relative to one post table, placeholder if absent).
# The order here is the column order of the returned DataFrame.
_POST_FIELDS = (
    ("time", "tr.titlebg2 > td > div:nth-child(3)", "No time"),
    ("post", "tr:nth-child(3) > td", "No post"),
    ("section", "tr.titlebg2 > td > div:nth-child(2) > a:nth-child(1)", "No section"),
    ("subsection", "tr.titlebg2 > td > div:nth-child(2) > a:nth-child(2)", "No subsection"),
    ("topic", "tr.titlebg2 > td > div:nth-child(2) > b > a", "No topic"),
)


def _text_or_default(root, selector, default):
    """Return the stripped text of the first ``selector`` match under ``root``, or ``default`` if none exists."""
    try:
        return root.find_element(By.CSS_SELECTOR, selector).text.strip()
    except NoSuchElementException:
        # Only a missing element maps to the placeholder; any other driver
        # failure is a real error and is allowed to propagate.
        return default


def scrape_recent_posts(num_posts):
    """
    Scrape the most recent Bitcointalk posts into a DataFrame.

    Pages of https://bitcointalk.org/index.php?action=recent are loaded in a
    headless Chrome session, advancing the ``start`` offset by 10 per page,
    until ``num_posts`` posts have been collected or a page yields no posts.

    :param num_posts: Maximum number of posts to collect. Values <= 0 return
        an empty DataFrame without starting a browser.
    :return: pandas DataFrame with one row per post and the string columns
        ``time``, ``post``, ``section``, ``subsection``, ``topic``. A field
        whose element is missing from the page holds a ``"No <field>"``
        placeholder. Fewer than ``num_posts`` rows are returned if the
        listing runs out of recognizable posts.
    :raises selenium.common.exceptions.WebDriverException: if the browser
        cannot be started or a page cannot be loaded; the browser is shut
        down before the exception propagates.
    """
    columns = [name for name, _selector, _default in _POST_FIELDS]

    # Nothing requested: skip the (expensive) browser start-up entirely.
    if num_posts <= 0:
        return pd.DataFrame(columns=columns)

    # Configure headless browser
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Initialize ChromeDriver using webdriver_manager
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

    posts = []
    page_count = 0

    # try/finally guarantees the Chrome process is terminated even when a
    # navigation or lookup raises part-way through the scrape.
    try:
        while len(posts) < num_posts:
            offset = page_count * _POSTS_PER_PAGE
            driver.get(_RECENT_POSTS_URL.format(offset=offset))
            time.sleep(_PAGE_LOAD_WAIT_S)  # Allow time for the page to load

            collected_before = len(posts)

            # Every direct child table of #bodyarea is a candidate; only
            # tables with a "titlebg2" header row are treated as posts.
            for table in driver.find_elements(By.CSS_SELECTOR, "#bodyarea > table"):
                # find_elements returns [] instead of raising when absent.
                if not table.find_elements(By.CSS_SELECTOR, "tr.titlebg2"):
                    continue

                posts.append({
                    name: _text_or_default(table, selector, default)
                    for name, selector, default in _POST_FIELDS
                })

                if len(posts) >= num_posts:
                    break

            page_count += 1
            # Report what was actually collected, not pages * page size.
            print(f"Currenly scraped: {len(posts)}")

            # A page that contributes nothing (end of listing, changed layout,
            # block page) would otherwise make this loop spin forever.
            if len(posts) == collected_before:
                break
    finally:
        driver.quit()

    # Explicit columns keep the schema stable even when no post was found.
    return pd.DataFrame(posts, columns=columns)

In [10]:
# Smoke run: request only the 10 most recent posts, which matches the
# scraper's 10-post offset step, so a single listing page should suffice.
# NOTE: this launches headless Chrome and hits the live site; results vary per run.
num_posts = 10
df_posts = scrape_recent_posts(num_posts)

Currenly scraped: 10


In [11]:
# Sanity-check the scraped frame: column names, non-null counts and dtypes,
# followed by a preview of the first rows.
# .info() prints as a side effect; .head() is rendered because it is the
# last expression in the cell.
df_posts.info()
df_posts.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   time        10 non-null     object
 1   post        10 non-null     object
 2   section     10 non-null     object
 3   subsection  10 non-null     object
 4   topic       10 non-null     object
dtypes: object(5)
memory usage: 532.0+ bytes


Unnamed: 0,time,post,section,subsection,topic
0,on: Today at 10:40:56 PM,Quote from: darxiaomi on Today at 10:35:12 PM\...,Economy,Gambling discussion,Re: ⚽ Sportsbet.io ⚽ Bundesliga ⚽ Football Poo...
1,on: Today at 10:40:04 PM,"Quote from: hafiztalha on March 07, 2025, 10:5...",Economy,Gambling discussion,Re: T20 and T20I cricket prediction and discus...
2,on: Today at 10:39:48 PM,Quote from: kTimesG on Today at 10:08:38 PM\ns...,Bitcoin,Bitcoin Discussion,Re: Bitcoin puzzle transaction ~32 BTC prize t...
3,on: Today at 10:39:43 PM,I’ve heard a lot of gamblers winning it big fr...,Economy,Gambling discussion,Re: ANYONE here makes a living in gambling?
4,on: Today at 10:39:35 PM,Quote from: MrJoeRubbish on Today at 09:22:07 ...,Local,Other languages/locations,Re: বাংলা (Bengali)
