# Web Scrape From Bitcoin Forum

In [7]:
import time
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException


In [8]:
def parse_timestamp(timestamp_str):
    """
    Parse a timestamp string (e.g., "January 02, 2024, 02:36:13 PM")
    into a datetime object. Adjust the format string if needed.
    """
    try:
        return datetime.strptime(timestamp_str, "%B %d, %Y, %I:%M:%S %p")
    except Exception:
        return None

In [9]:
def scrape_topic_posts(driver, topic_url, subject_title, subsection):
    """
    Given a topic URL, visits the page (with ;all appended) and extracts all posts.
    For each post, extracts the timestamp and post content using the provided Relative XPath.
    Returns a list of dictionaries with keys: 'timestamp', 'post', 'subject', 'subsection'.
    """
    full_url = topic_url
    driver.get(full_url)
    time.sleep(3)  # Allow time for the page to load

    topic_posts = []
    # Locate all post containers within #quickModForm
    posts = driver.find_elements(
        "xpath", "//*[@id='quickModForm']/table[contains(@class,'bordercolor')]/tbody/tr"
    )

    if not posts:
        print("No posts found on topic page:", full_url)

    for p in posts:
        try:
            # Extract timestamp using the provided Relative XPath
            timestamp_text = p.find_element(
                "xpath", ".//td[2]/table/tbody/tr/td[2]/div[2]"
            ).text.strip()
            timestamp_obj = datetime.strptime(timestamp_text, "%B %d, %Y, %I:%M:%S %p")
            timestamp_text = timestamp_obj.strftime("%Y-%m-%d")
        except NoSuchElementException:
            timestamp_text = "No timestamp"

        try:
            # Extract post content using a more precise XPath
            post_text = p.find_element(
                "xpath", ".//td[2]/div[@class='post']"
            ).text.strip()
        except NoSuchElementException:
            post_text = "No content"

        topic_posts.append({
            "timestamp": timestamp_text,
            "post": post_text,
            "subject": subject_title,
            "subsection": subsection
        })

    return topic_posts


In [10]:
def scrape_subsection_topics(num_pages, board_prefix, start_subject_num, start_date_str, 
                             end_date_str, keywords):
    """
    Scrapes topics from a specific board (subsection) page that meet criteria:
      - The topic's last post timestamp is within the specified date range.
      - The topic title contains the given keyword (case-insensitive).
    
    For each matching topic, the function visits the topic's URL (with ;all appended)
    and extracts all posts (timestamp and post content).
    """
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
    
    # Configure headless Chrome browser
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )
    
    all_posts = []
    
    # Loop over board pages. Each page URL is formed using the subject_start_num (which increments by 40).
    for page in range(num_pages):
        subject_start_num = start_subject_num + page * 40
        board_url = f"{board_prefix}{subject_start_num}"
        print(f"Accessing board page: {board_url}")
        driver.get(board_url)
        time.sleep(3)
        
        # Extract topic rows from the board page.
        # The CSS selectors here assume that each topic is in a table row within a table with class "bordercolor".
        topic_rows = driver.find_elements(By.CSS_SELECTOR, "#bodyarea table.bordercolor tr")
        
        for row in topic_rows:
            try:
                # Locate the topic link element
                link = row.find_element(By.CSS_SELECTOR, "td a[href*='topic=']")
                title = link.text.strip()
                topic_url = link.get_attribute("href")
                
                # Locate the element that holds the last post timestamp.
                last_post_elem = row.find_element(By.CSS_SELECTOR, "td.windowbg2.lastpostcol span")
                # Extract the timestamp string (taking only the first line)
                last_post_text = last_post_elem.text.strip().split("\n")[0]
                topic_last_post = parse_timestamp(last_post_text)
                if topic_last_post is None:
                    continue
                
                # Filter topics by date range
                if not (start_date <= topic_last_post <= end_date):
                    continue
                
                # Filter by keyword (case-insensitive)
                #if keyword.lower() not in title.lower():
                #    subsection = keyword
                #    continue
                matched_coin = None
                for abbr, full_name in keywords.items():
                    if abbr.lower() in title.lower() or full_name.lower() in title.lower():
                        matched_coin = abbr  # Assign the abbreviation (e.g., "BTC")
                        break  # Stop checking after the first match
                
                # If no keyword match, skip this topic
                if not matched_coin:
                    continue
                
                subsection = matched_coin
                
                # Here, we set the subsection name. You might extract this differently.
                print(f"Scraping topic: {title} | Last post: {last_post_text}")
                
                # Scrape all posts from the topic
                topic_posts = scrape_topic_posts(driver, topic_url, title, subsection)
                all_posts.extend(topic_posts)
            except Exception as e:
                # If any error occurs in processing a row, skip it.
                continue
    
    driver.quit()
    df = pd.DataFrame(all_posts)
    return df

Due to the stucture of the website, bitcoin and alternative coins are scraped seperatly.

In [11]:
bitcoin = {
    "BTC": "Bitcoin"
}

df_bitcoin_posts = scrape_subsection_topics(num_pages=50,
                                           board_prefix="https://bitcointalk.org/index.php?board=1.",
                                           start_subject_num=380, 
                                           start_date_str="2024-01-01", 
                                           end_date_str="2024-12-31", 
                                           keywords=bitcoin)


Accessing board page: https://bitcointalk.org/index.php?board=1.380
Scraping topic: Has Bitcoin met your expectations from the time you bought your Bitcoin till now | Last post: December 30, 2024, 07:35:51 PM
Accessing board page: https://bitcointalk.org/index.php?board=1.420
Scraping topic: Bitcoin is taxed up to 110% in Japan | Last post: December 30, 2024, 09:56:36 AM
Accessing board page: https://bitcointalk.org/index.php?board=1.460
Scraping topic: Is bitcoin slowly dying? | Last post: December 20, 2024, 02:31:47 AM
Accessing board page: https://bitcointalk.org/index.php?board=1.500
Scraping topic: one reason not to buy BitCoin | Last post: December 12, 2024, 02:03:07 PM
Accessing board page: https://bitcointalk.org/index.php?board=1.540
Scraping topic: BTC finally touch 100k+ milestone | Last post: December 05, 2024, 11:41:43 PM
Accessing board page: https://bitcointalk.org/index.php?board=1.580
Scraping topic: Bitcoin now more costly than a kilogram of gold | Last post: November

In [12]:
df_bitcoin_posts.info()
df_bitcoin_posts.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 868 entries, 0 to 867
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   timestamp   868 non-null    object
 1   post        868 non-null    object
 2   subject     868 non-null    object
 3   subsection  868 non-null    object
dtypes: object(4)
memory usage: 27.3+ KB


Unnamed: 0,timestamp,post,subject,subsection
0,2024-11-05,You've been holding Bitcoin for many years and...,Has Bitcoin met your expectations from the tim...,BTC
1,2024-11-05,Bitcoin has indeed exceeded my expectations. N...,Has Bitcoin met your expectations from the tim...,BTC
2,2024-11-05,I would say Bitcoin has exceeded my expectatio...,Has Bitcoin met your expectations from the tim...,BTC
3,2024-11-05,"Quote from: Davidvictorson on November 05, 202...",Has Bitcoin met your expectations from the tim...,BTC
4,2024-11-05,"Quote from: Davidvictorson on November 05, 202...",Has Bitcoin met your expectations from the tim...,BTC


In [14]:
altcoin_namelist = {
    "ETH": "Ethereum",
    "XRP": "Ripple",
    "SOL": "Solana",
    "DOGE": "Dogecoin",
    "ADA": "Cardano",
    "TRX": "TRON",
    "XLM": "Stellar",
    "AVAX": "Avalanche",
    "SHIB": "Shiba Inu"
}

df_altcoin_posts = scrape_subsection_topics(num_pages=50, 
                                           board_prefix="https://bitcointalk.org/index.php?board=67.",
                                           start_subject_num=400, 
                                           start_date_str="2024-01-01", 
                                           end_date_str="2024-12-31", 
                                           keywords=altcoin_namelist)  # Pass as a dictionary

Accessing board page: https://bitcointalk.org/index.php?board=67.400
Accessing board page: https://bitcointalk.org/index.php?board=67.440
Scraping topic: Does intraday Trading actually works? | Last post: December 30, 2024, 09:16:21 PM
Accessing board page: https://bitcointalk.org/index.php?board=67.480
Scraping topic: ðŸŸ£INOFFICIAL SOLANA THREAD ðŸŸ£ [ Discuss everything about Solana here ] | Last post: December 23, 2024, 03:13:12 AM
Accessing board page: https://bitcointalk.org/index.php?board=67.520
Scraping topic: Ethereum fees | Last post: December 16, 2024, 10:23:42 AM
Accessing board page: https://bitcointalk.org/index.php?board=67.560
Scraping topic: Tron New ATH | Last post: December 09, 2024, 07:18:04 PM
Accessing board page: https://bitcointalk.org/index.php?board=67.600
Scraping topic: The Ethereum killer has been killed by Ripple hehehe | Last post: December 03, 2024, 03:05:57 AM
Accessing board page: https://bitcointalk.org/index.php?board=67.640
Scraping topic: Solana b

In [15]:
df_altcoin_posts.info()
df_altcoin_posts.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658 entries, 0 to 657
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   timestamp   658 non-null    object
 1   post        658 non-null    object
 2   subject     658 non-null    object
 3   subsection  658 non-null    object
dtypes: object(4)
memory usage: 20.7+ KB


Unnamed: 0,timestamp,post,subject,subsection
0,2024-12-30,Now I've been in crypto since 2022. I noticed ...,Does intraday Trading actually works?,ADA
1,2024-12-30,"Quote from: Novachrono2022 on December 30, 202...",Does intraday Trading actually works?,ADA
2,2024-12-30,"Quote from: Novachrono2022 on December 30, 202...",Does intraday Trading actually works?,ADA
3,2024-12-30,"Quote from: Novachrono2022 on December 30, 202...",Does intraday Trading actually works?,ADA
4,2024-12-30,"Quote from: Novachrono2022 on December 30, 202...",Does intraday Trading actually works?,ADA


In [21]:
# combine dataframes
df_crypto_posts = pd.concat([df_bitcoin_posts, df_altcoin_posts], ignore_index=True)

# NLP

1. Text Cleaning
2. Sentiment & Polarity Acore Calculations

In [17]:
%pip install textblob
%pip install re

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement re (from versions: none)
ERROR: No matching distribution found for re


In [18]:
import pandas as pd
import re
from textblob import TextBlob

# Function to clean text
def clean_text(text):
    # Remove quoted messages (common in forum posts)
    text = re.sub(r'Quote from: .*?\n', '', text, flags=re.DOTALL)
    
    # Remove extra newlines and multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Function to compute sentiment polarity and subjectivity
def compute_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity


In [30]:
# Load DataFrame
df_crypto_posts['cleaned_post'] = df_crypto_posts['post'].apply(clean_text)

# Apply sentiment analysis on cleaned text
df_crypto_posts[['polarity', 'subjectivity']] = df_crypto_posts['cleaned_post'].apply(
    lambda txt: pd.Series(compute_sentiment(txt))
)

In [32]:
# Display the updated DataFrame
df_crypto_posts.info()
print(df_crypto_posts.head())

# Save to json
df_crypto_posts.to_json("crypto_posts.json", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526 entries, 0 to 1525
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     1526 non-null   object 
 1   post          1526 non-null   object 
 2   subject       1526 non-null   object 
 3   subsection    1526 non-null   object 
 4   cleaned_post  1526 non-null   object 
 5   polarity      1526 non-null   float64
 6   subjectivity  1526 non-null   float64
dtypes: float64(2), object(5)
memory usage: 83.6+ KB
    timestamp                                               post  \
0  2024-11-05  You've been holding Bitcoin for many years and...   
1  2024-11-05  Bitcoin has indeed exceeded my expectations. N...   
2  2024-11-05  I would say Bitcoin has exceeded my expectatio...   
3  2024-11-05  Quote from: Davidvictorson on November 05, 202...   
4  2024-11-05  Quote from: Davidvictorson on November 05, 202...   

                                             